diff --git a/API/audio_api.py b/API/audio_api.py index fd8256b..4645f37 100644 --- a/API/audio_api.py +++ b/API/audio_api.py @@ -2,19 +2,24 @@ """ API/audio_api.py — Marcus Audio API Layer ========================================== -Provides speak() and record_audio() for the Brain layer. +Provides speak() and record() for the Brain layer. Brain imports ONLY from this API — never from unitree SDK directly. -Speaker: _CallRequestWithParamAndBin (single call, full buffer) -Mic: parec -d 3 (Hollyland wireless, PulseAudio source index from config) -TTS EN: Unitree built-in TtsMaker -TTS AR: Piper ar_JO-kareem-medium → resample → G1 speaker +Speaker: Unitree built-in TtsMaker (G1 on-board engine, English only, + no MP3/WAV plumbing, no internet). Optional raw-PCM playback path + via _play_pcm() is kept for future modules that synthesize their + own audio (e.g. offline Piper). +Mic: G1 built-in mic (UDP multicast 239.168.123.161:5555, 16 kHz mono). + Legacy Hollyland/parec path retained as fallback when + config_Voice.json has mic.backend="pactl_parec". +TTS: English only. Arabic is rejected (the G1 firmware silently maps + Arabic to Chinese, which confuses everyone — if Arabic TTS is ever + needed again, use a separate offline backend like Piper). Usage: from API.audio_api import AudioAPI audio = AudioAPI() - audio.speak("Hello", "en") - audio.speak("مرحبا", "ar") + audio.speak("Hello, I am Sanad") recording = audio.record(seconds=5) audio.play_pcm(recording) """ @@ -71,7 +76,24 @@ class AudioAPI: self._tts = self._config["tts"] self._mic = self._config["mic"] self._spk = self._config["speaker"] - self._target_rate = self._tts["target_sample_rate"] + self._target_rate = self._tts.get("target_sample_rate", 16000) + + # Default mic backend: G1 built-in UDP multicast. + # Set mic.backend="pactl_parec" in config_Voice.json to fall back + # to the legacy Hollyland/PulseAudio path. + self._mic_backend = self._mic.get("backend", "builtin_udp") + self._builtin_mic = None # lazy-initialized on first record() + + # Built-in TTS wrapper (uses the already-initialized AudioClient). + # Keeps TTS synchronous so `is_speaking` is meaningful to the voice + # loop that needs to skip mic input during playback. + self._tts_engine = None + if self._sdk_available: + from Voice.builtin_tts import BuiltinTTS + self._tts_engine = BuiltinTTS( + self._client, + default_speaker_id=self._tts.get("builtin_speaker_id", 0), + ) # Data dir data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"]) @@ -82,7 +104,10 @@ class AudioAPI: self._speaking = False self._speak_lock = threading.Lock() - log.info(self._config["messages"]["ready"]) + log.info("%s (mic=%s, tts=%s)", + self._config["messages"]["ready"], + self._mic_backend, + "builtin_ttsmaker" if self._tts_engine else "disabled") def _init_sdk(self): """Initialize Unitree AudioClient.""" @@ -105,55 +130,63 @@ class AudioAPI: # ─── SPEAK ──────────────────────────────────────────── - def speak(self, text: str, lang: str = "auto"): + def speak(self, text: str, lang: str = "en"): """ - Speak text in the given language. - Mutes mic during playback to prevent self-listening. - lang="en" → built-in TtsMaker - lang="ar" → Piper → resample → G1 speaker - lang="auto" → detect from text - """ - if lang == "auto": - lang = self._detect_lang(text) + Speak `text` in English through the G1 built-in TTS (TtsMaker). 
- log.info("[%s] speak: %s", lang.upper(), text[:80]) + Mutes (flushes) the mic during playback so the voice loop doesn't + hear the robot's own voice and transcribe itself. The `lang` + argument is accepted for API compatibility but only "en" plays — + non-ASCII text (Arabic) is rejected by BuiltinTTS. + """ + if lang and lang not in ("en", "auto"): + log.warning("builtin_tts only supports English; got lang=%r — skipping", lang) + return + if self._tts_engine is None: + log.error("No TTS engine initialized — audio SDK unavailable") + return + + log.info("speak: %s", text[:80]) with self._speak_lock: self._speaking = True self._mute_mic() - try: - if lang == "en": - self._speak_english(text) - elif lang == "ar": - self._speak_arabic(text) - else: - log.warning("Unknown lang '%s', falling back to English", lang) - self._speak_english(text) + self._tts_engine.speak(text, block=True) except Exception as e: log.error("%s: %s", self._config["messages"]["error_tts"], e) finally: - # Small delay so speaker fully stops before mic reopens - time.sleep(0.3) + # Small tail so the speaker fully finishes before the mic is + # re-opened for capture + time.sleep(0.2) self._unmute_mic() self._speaking = False def _mute_mic(self): - """Mute the wireless mic to prevent self-listening.""" + """ + Suppress mic input during TTS playback. + For the UDP built-in mic, flush the buffer so we don't capture any + echo that's already been queued. For the legacy PulseAudio path, + actually mute the source. + """ + if self._mic_backend == "builtin_udp": + if self._builtin_mic is not None: + self._builtin_mic.flush() + return source = self._mic["source_index"] - subprocess.run( - ["pactl", "set-source-mute", source, "1"], - capture_output=True, - ) + subprocess.run(["pactl", "set-source-mute", source, "1"], + capture_output=True) log.debug("Mic muted") def _unmute_mic(self): - """Unmute the wireless mic.""" + """Re-enable mic after TTS playback (pactl path only).""" + if self._mic_backend == "builtin_udp": + if self._builtin_mic is not None: + self._builtin_mic.flush() + return source = self._mic["source_index"] - subprocess.run( - ["pactl", "set-source-mute", source, "0"], - capture_output=True, - ) + subprocess.run(["pactl", "set-source-mute", source, "0"], + capture_output=True) log.debug("Mic unmuted") @property @@ -161,88 +194,8 @@ class AudioAPI: """True while TTS is playing — voice module checks this.""" return self._speaking - def _speak_english(self, text: str): - """English TTS via edge-tts.""" - self._speak_edge_tts(text, "en") - - def _speak_arabic(self, text: str): - """Arabic TTS via edge-tts.""" - self._speak_edge_tts(text, "ar") - - def speak_piper_en(self, text: str): - """Alternative: English via Piper instead of built-in.""" - voice = self._tts["piper_voice_en"] - audio, rate = self._piper_synthesize(text, voice) - audio_16k = self._resample(audio, rate) - self._play_pcm(audio_16k) - - # ─── PIPER TTS ──────────────────────────────────────── - - def _piper_synthesize(self, text: str, voice: str) -> tuple: - """Run Piper CLI, return (audio_int16, sample_rate).""" - cmd = ["piper", "--model", voice, "--output_raw"] - timeout = self._tts["piper_timeout_sec"] - - proc = subprocess.run( - cmd, - input=text.encode("utf-8"), - capture_output=True, - timeout=timeout, - ) - - if proc.returncode != 0: - stderr = proc.stderr.decode()[:300] - raise RuntimeError(f"Piper failed: {stderr}") - - audio = np.frombuffer(proc.stdout, dtype=np.int16) - piper_rate = self._tts["piper_sample_rate"] - log.info("Piper: %d 
samples @ %dHz (%.1fs)", len(audio), piper_rate, len(audio) / piper_rate) - return audio, piper_rate - - # ─── RESAMPLE ───────────────────────────────────────── - - - def _speak_edge_tts(self, text: str, lang: str): - """Generate speech via edge-tts and play on G1.""" - import os as _os - voice = "ar-AE-HamdanNeural" if lang == "ar" else "en-US-GuyNeural" - ts = int(time.time() * 1000) - mp3_path = f"/tmp/edge_{lang}_{ts}.mp3" - wav_path = f"/tmp/edge_{lang}_{ts}.wav" - - safe_text = text.replace('"', '\\"') - code = f'import edge_tts, asyncio; asyncio.run(edge_tts.Communicate(\"{safe_text}\", voice=\"{voice}\").save(\"{mp3_path}\"))' - result = subprocess.run(["python3", "-c", code], capture_output=True, text=True, timeout=30) - - if result.returncode != 0: - log.error("edge-tts failed: %s", result.stderr[:200]) - if lang == "en" and self._sdk_available: - self._client.TtsMaker(text, self._tts.get("builtin_speaker_id", 1)) - time.sleep(max(2.0, len(text) * 0.06)) - return - - try: - from pydub import AudioSegment - a = AudioSegment.from_mp3(mp3_path) - a = a.set_frame_rate(16000).set_channels(1).set_sample_width(2) - a.export(wav_path, format="wav") - - import wave - with wave.open(wav_path, "rb") as wf: - audio = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16) - - _os.unlink(mp3_path) - _os.unlink(wav_path) - self._play_pcm(audio) - except Exception as e: - log.error("edge-tts conversion error: %s", e) - try: _os.unlink(mp3_path) - except: pass - try: _os.unlink(wav_path) - except: pass - def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray: - """Resample to target rate (16kHz).""" + """Linear resample int16 PCM to self._target_rate (16 kHz).""" if src_rate == self._target_rate: return audio tl = int(len(audio) * self._target_rate / src_rate) @@ -252,7 +205,7 @@ class AudioAPI: audio.astype(np.float64), ).astype(np.int16) - # ─── G1 SPEAKER PLAYBACK ───────────────────────────── + # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ───────── def _play_pcm(self, audio_16k: np.ndarray) -> float: """Play 16kHz mono int16 on G1 speaker. Returns duration.""" @@ -308,24 +261,50 @@ class AudioAPI: # ─── MIC RECORDING ─────────────────────────────────── def record(self, seconds: float = 5.0) -> np.ndarray: - """Record from Hollyland wireless mic via parec. Returns int16 array.""" + """ + Capture `seconds` of int16 mono 16 kHz PCM. + + Default backend is the G1 built-in mic (UDP multicast). Set + mic.backend="pactl_parec" in config_Voice.json to use the + legacy Hollyland/parec path instead. 
+ """ + if self._mic_backend == "builtin_udp": + return self._record_builtin(seconds) + return self._record_parec(seconds) + + def _record_builtin(self, seconds: float) -> np.ndarray: + """Built-in mic path — join UDP multicast, read the requested duration.""" + if self._builtin_mic is None: + from Voice.builtin_mic import BuiltinMic + mcfg = self._config.get("mic_udp", {}) + self._builtin_mic = BuiltinMic( + group=mcfg.get("group", "239.168.123.161"), + port=mcfg.get("port", 5555), + buf_max=mcfg.get("buffer_max_bytes", 64000), + ) + self._builtin_mic.start() + time.sleep(0.2) # let the receiver thread fill in + + log.info("Recording %.1fs from G1 built-in mic", seconds) + raw = self._builtin_mic.read_seconds(seconds) + audio = np.frombuffer(raw, dtype=np.int16) + log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std()) + if audio.std() < 50: + log.warning(self._config["messages"]["error_mic"] + + " — G1 mic silent (check audio service on robot)") + return audio + + def _record_parec(self, seconds: float) -> np.ndarray: + """Legacy Hollyland/PulseAudio path — only used if mic.backend='pactl_parec'.""" source = self._mic["source_index"] rate = str(self._mic["rate"]) channels = str(self._mic["channels"]) fmt = self._mic["format"] - # Unmute mic - subprocess.run( - ["pactl", "set-source-mute", source, "0"], - capture_output=True, - ) - subprocess.run( - ["pactl", "set-source-volume", source, "100%"], - capture_output=True, - ) - - log.info("Recording %.1fs from mic source %s", seconds, source) + subprocess.run(["pactl", "set-source-mute", source, "0"], capture_output=True) + subprocess.run(["pactl", "set-source-volume", source, "100%"], capture_output=True) + log.info("Recording %.1fs from mic source %s (parec)", seconds, source) proc = subprocess.Popen( ["parec", "-d", source, f"--format={fmt}", f"--rate={rate}", f"--channels={channels}", "--raw"], @@ -337,10 +316,8 @@ class AudioAPI: audio = np.frombuffer(raw, dtype=np.int16) log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std()) - if audio.std() < 50: log.warning(self._config["messages"]["error_mic"] + " — mic may be silent") - return audio def save_recording(self, audio: np.ndarray, name: str) -> str: @@ -355,16 +332,6 @@ class AudioAPI: log.info("Saved: %s", path) return path - # ─── LANGUAGE DETECTION ─────────────────────────────── - - @staticmethod - def _detect_lang(text: str) -> str: - """Detect language from text — Arabic Unicode range check.""" - for c in text: - if '\u0600' <= c <= '\u06FF': - return "ar" - return "en" - # ─── STATUS ─────────────────────────────────────────── @property @@ -378,27 +345,16 @@ if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Marcus Audio API Test") - parser.add_argument("--test", action="store_true", help="Run speak tests") - parser.add_argument("--speak", type=str, help="Speak this text") - parser.add_argument("--lang", default="auto", help="Language: en, ar, auto") + parser.add_argument("--test", action="store_true", help="Run TTS + record test") + parser.add_argument("--speak", type=str, help="Speak this English text") parser.add_argument("--record", type=float, default=0, help="Record N seconds") args = parser.parse_args() api = AudioAPI() if args.test: - print("\n--- English built-in ---") - api.speak("Hello, I am Marcus.", "en") - time.sleep(1) - - print("\n--- Arabic Piper ---") - api.speak("مرحبا، أنا ماركوس", "ar") - time.sleep(1) - - print("\n--- Auto-detect ---") - api.speak("How are you?") - time.sleep(1) - api.speak("كيف 
حالك؟") + print("\n--- English (TtsMaker) ---") + api.speak("Hello, I am Sanad.") time.sleep(1) print("\n--- Record 3s + playback ---") @@ -408,7 +364,7 @@ if __name__ == "__main__": print("\nDone.") elif args.speak: - api.speak(args.speak, args.lang) + api.speak(args.speak) elif args.record > 0: rec = api.record(args.record) diff --git a/API/yolo_api.py b/API/yolo_api.py index 1495c4c..e6e1d64 100644 --- a/API/yolo_api.py +++ b/API/yolo_api.py @@ -49,9 +49,28 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool: print(f"marcus_yolo.py not found ({e})") return False - # GPU is required — let RuntimeError from _resolve_device propagate so - # Marcus hard-fails at startup instead of silently running without vision. - ok = start_yolo(raw_frame_ref=raw_frame_ref, frame_lock=frame_lock) + # GPU is required. _resolve_device() raises RuntimeError when CUDA is + # missing — surface that with an actionable banner before re-raising so + # Marcus hard-fails with a clear error instead of a raw stack trace. + try: + ok = start_yolo(raw_frame_ref=raw_frame_ref, frame_lock=frame_lock) + except RuntimeError as e: + print() + print("╔" + "═" * 68 + "╗") + print("║ MARCUS STARTUP ABORTED — GPU REQUIRED".ljust(69) + "║") + print("╠" + "═" * 68 + "╣") + print(f"║ {str(e)[:66]:<66} ║") + print("║" + " " * 68 + "║") + print("║ On the Jetson, verify:".ljust(69) + "║") + print("║ tegrastats # GPU exists & is not throttled".ljust(69) + "║") + print("║ python3 -c 'import torch; print(torch.cuda.is_available())'".ljust(69) + "║") + print("║ nvcc --version # CUDA toolkit reachable".ljust(69) + "║") + print("║ Expected: torch 2.1.0 nv23.06, CUDA 11.4, GPU=Orin.".ljust(69) + "║") + print("║ See Doc/environment.md section 9 for the reinstall recipe.".ljust(69) + "║") + print("╚" + "═" * 68 + "╝") + print() + raise + if ok: YOLO_AVAILABLE = True yolo_sees = _ys diff --git a/API/zmq_api.py b/API/zmq_api.py index de7a0c0..adeb91b 100644 --- a/API/zmq_api.py +++ b/API/zmq_api.py @@ -1,7 +1,16 @@ """ zmq_api.py — ZMQ velocity + command interface to Holosoma + +Previously the PUB socket was bound at module import time. That made the +module unsafe to re-import from any multiprocessing child (e.g. the LiDAR +SLAM_worker spawn), because the child would try to rebind the same port +and crash with `Address already in use`. + +The bind now lives in init_zmq() — call it once from the brain entrypoint. +Child processes can import this module without any network side effects. """ import json +import os import time import zmq from Core.config_loader import load_config @@ -15,35 +24,62 @@ STOP_ITERATIONS = _cfg["stop_iterations"] STOP_DELAY = _cfg["stop_delay"] STEP_PAUSE = _cfg["step_pause"] -ctx = zmq.Context() -sock = ctx.socket(zmq.PUB) -sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}") -time.sleep(0.5) -log(f"ZMQ PUB bound on tcp://{ZMQ_HOST}:{ZMQ_PORT}", "info", "zmq") +# Shared state. These stay None until init_zmq() is called. +ctx: zmq.Context = None +sock: zmq.Socket = None +_INIT_SETTLE = 0.5 # seconds to let PUB tell subscribers it's alive + + +def init_zmq() -> zmq.Socket: + """ + Bind the PUB socket. Idempotent — safe to call more than once. + Call from the main (parent) process only. Do NOT call from multiprocessing + children — they inherit nothing useful from the bound socket anyway. 
+    """
+    global ctx, sock
+    if sock is not None:
+        return sock
+    ctx = zmq.Context()
+    sock = ctx.socket(zmq.PUB)
+    sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}")
+    time.sleep(_INIT_SETTLE)
+    log(f"ZMQ PUB bound on tcp://{ZMQ_HOST}:{ZMQ_PORT} (pid={os.getpid()})",
+        "info", "zmq")
+    return sock
+
+
+def _ensure_sock() -> zmq.Socket:
+    if sock is None:
+        raise RuntimeError(
+            "zmq_api not initialized — call init_zmq() from the brain "
+            "entrypoint before using send_vel/send_cmd/gradual_stop"
+        )
+    return sock


 def get_socket():
     """Return the shared ZMQ PUB socket (for odometry to reuse)."""
-    return sock
+    return _ensure_sock()


 def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0):
     """Send velocity to Holosoma. vx m/s | vy m/s | vyaw rad/s"""
-    sock.send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))
+    _ensure_sock().send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))


 def gradual_stop():
     """Smooth deceleration to zero over ~1 second."""
+    s = _ensure_sock()
     for _ in range(STOP_ITERATIONS):
-        send_vel(0.0, 0.0, 0.0)
+        s.send_string(json.dumps({"vel": {"vx": 0.0, "vy": 0.0, "vyaw": 0.0}}))
         time.sleep(STOP_DELAY)


 def send_cmd(cmd: str):
     """Send Holosoma state command: start | walk | stand | stop"""
-    sock.send_string(json.dumps({"cmd": cmd}))
+    _ensure_sock().send_string(json.dumps({"cmd": cmd}))


-# Load MOVE_MAP from navigation config
+# Load MOVE_MAP from navigation config (pure data, safe at import time)
 _nav = load_config("Navigation")
 MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()}
diff --git a/Autonomous/marcus_autonomous.py b/Autonomous/marcus_autonomous.py
index f4c2849..1cc321f 100644
--- a/Autonomous/marcus_autonomous.py
+++ b/Autonomous/marcus_autonomous.py
@@ -292,7 +292,10 @@ class AutonomousMode:
                 self._enabled = False
                 break

-            time.sleep(YOLO_CHECK_INTERVAL)
+            # No trailing sleep — _move_forward() takes FORWARD_DURATION,
+            # _turn() takes TURN_DURATION, and LLaVA assessment is ~1-2s.
+            # The body always consumes real wall time, so an extra sleep here
+            # would be pure dead time.

         # Clean up
         self._gradual_stop()
diff --git a/Brain/marcus_brain.py b/Brain/marcus_brain.py
index 27fd78e..86c6920 100644
--- a/Brain/marcus_brain.py
+++ b/Brain/marcus_brain.py
@@ -17,7 +17,7 @@ PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 if PROJECT_DIR not in sys.path:
     sys.path.insert(0, PROJECT_DIR)

-from API.zmq_api import send_vel, gradual_stop, send_cmd
+from API.zmq_api import init_zmq, send_vel, gradual_stop, send_cmd
 from API.camera_api import start_camera, stop_camera, get_frame
 from API.yolo_api import (
     init_yolo, yolo_summary, yolo_fps,
@@ -70,7 +70,19 @@ _NAT_GOAL_RE = re.compile(

# ══════════════════════════════════════════════════════════════════════════════

def init_brain():
-    """Initialize all subsystems. Call once at startup."""
+    """Initialize all subsystems. Call once at startup from the parent process.
+
+    Optional subsystems (lidar / voice / imgsearch / autonomous) are gated on
+    `config_Brain.json::subsystems.<name>`. Disabling the ones you don't need
+    brings Marcus's boot time down from ~18 s to ~5-7 s.
+    """
+    subsys = _cfg.get("subsystems", {}) or {}
+
+    # Bind the ZMQ PUB socket before anything tries to publish on it.
+    # This is now explicit (previously it happened as an import side effect,
+    # which crashed every multiprocessing child that re-imported zmq_api).
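+    # Note: every publisher (send_vel, send_cmd, gradual_stop) raises
+    # RuntimeError until this bind completes, so keep it first.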
+ init_zmq() + raw_frame, raw_lock = start_camera() init_yolo(raw_frame, raw_lock) @@ -79,53 +91,65 @@ def init_brain(): init_memory() - # LiDAR (optional — continues without it) - try: - from API.lidar_api import init_lidar - init_lidar() - except Exception as e: - print(f" [LiDAR] Init failed: {e} — continuing without LiDAR") + # LiDAR — optional + if subsys.get("lidar", True): + try: + from API.lidar_api import init_lidar + init_lidar() + except Exception as e: + print(f" [LiDAR] Init failed: {e} — continuing without LiDAR") + else: + print(" [LiDAR] disabled by config") - init_imgsearch( - get_frame_fn=get_frame, - send_vel_fn=send_vel, - gradual_stop_fn=gradual_stop, - llava_fn=call_llava, - yolo_sees_fn=yolo_sees, - model=OLLAMA_MODEL, - ) + # Image search — optional + if subsys.get("imgsearch", False): + init_imgsearch( + get_frame_fn=get_frame, + send_vel_fn=send_vel, + gradual_stop_fn=gradual_stop, + llava_fn=call_llava, + yolo_sees_fn=yolo_sees, + model=OLLAMA_MODEL, + ) + else: + print(" [ImgSearch] disabled by config") - # Autonomous exploration mode - from API.memory_api import mem as _mem_ref - from API.llava_api import PATROL_PROMPT - auto = AutonomousMode( - get_frame_fn=get_frame, - send_vel_fn=send_vel, - gradual_stop_fn=gradual_stop, - yolo_sees_fn=yolo_sees, - yolo_summary_fn=yolo_summary, - yolo_all_classes_fn=yolo_all_classes, - yolo_closest_fn=yolo_closest, - odom_fn=lambda: {"x": 0, "y": 0, "heading": 0}, # fallback if no odom - call_llava_fn=call_llava, - patrol_prompt=PATROL_PROMPT, - mem=_mem_ref, - ) - # Wire odometry if available - from API.odometry_api import odom as _odom_ref, ODOM_AVAILABLE - if _odom_ref and ODOM_AVAILABLE: - auto._odom_pos = lambda: { - "x": _odom_ref._x, "y": _odom_ref._y, "heading": _odom_ref._heading - } - init_autonomous(auto) + # Autonomous exploration mode — optional + if subsys.get("autonomous", True): + from API.memory_api import mem as _mem_ref + from API.llava_api import PATROL_PROMPT + auto = AutonomousMode( + get_frame_fn=get_frame, + send_vel_fn=send_vel, + gradual_stop_fn=gradual_stop, + yolo_sees_fn=yolo_sees, + yolo_summary_fn=yolo_summary, + yolo_all_classes_fn=yolo_all_classes, + yolo_closest_fn=yolo_closest, + odom_fn=lambda: {"x": 0, "y": 0, "heading": 0}, + call_llava_fn=call_llava, + patrol_prompt=PATROL_PROMPT, + mem=_mem_ref, + ) + from API.odometry_api import odom as _odom_ref, ODOM_AVAILABLE + if _odom_ref and ODOM_AVAILABLE: + auto._odom_pos = lambda: { + "x": _odom_ref._x, "y": _odom_ref._y, "heading": _odom_ref._heading + } + init_autonomous(auto) + else: + print(" [Autonomous] disabled by config") send_cmd("start") time.sleep(0.5) send_cmd("walk") time.sleep(0.5) - # Voice module (optional — continues without it) - _init_voice() + # Voice module — optional + if subsys.get("voice", True): + _init_voice() + else: + print(" [Voice] disabled by config") _log("Brain initialized", "info", "brain") _warmup_llava() @@ -137,44 +161,37 @@ _voice_module = None def _init_voice(): - """Initialize voice module — runs in background, calls process_command on speech.""" + """ + Initialize the voice subsystem: G1 built-in mic + Whisper STT + G1 + built-in TtsMaker for replies. Every transcribed command flows through + process_command(), and the resulting `speak` string is sent to the G1 + speaker. 
+ """ global _audio_api, _voice_module try: from API.audio_api import AudioAPI - from Voice.marcus_gemini_voice import GeminiVoiceModule as VoiceModule + from Voice.marcus_voice import VoiceModule _audio_api = AudioAPI() - def _voice_callback(text, role): - """Gemini voice callback.""" - pass # handled below - if role != "user" or not text.strip(): + def _on_command(text, lang): + text = (text or "").strip() + if not text: return - t = text.strip().lower() - act_kw = ["turn","move","go","walk","step","stop","come","wave","clap", - "high five","shake","hug","forward","backward","left","right", - "what do you see","what can you see","look","describe","patrol", - "دور","امشي","روح","تقدم","ارجع","وقف","قف","تعال", - "يمين","يسار","قدام","ورا","لوح","صفق","سلم", - "شو شايف","شو تشوف","ماذا ترى","شو قدامك","لف","خطوات"] - if any(kw in t for kw in act_kw): - print(f" [Brain] Action: {text.strip()}") - try: - result = process_command(text.strip()) - if isinstance(result, dict): - sp = result.get("speak", "") - vis_kw = ["see","look","describe","شايف","تشوف","ترى","قدامك"] - if any(k in t for k in vis_kw) and sp and _audio_api: - print(f" [Brain] Vision: {sp}") - _audio_api.speak(sp) - except Exception as e: - print(f" [Brain] Error: {e}") - else: - print(f" [Chat] {text.strip()}") + print(f" [Voice] {text}") + try: + result = process_command(text) + except Exception as e: + print(f" [Brain] Error processing voice command: {e}") + return + if isinstance(result, dict): + sp = (result.get("speak") or "").strip() + if sp and _audio_api: + _audio_api.speak(sp) - _voice_module = VoiceModule(_audio_api, on_transcript=_voice_callback) + _voice_module = VoiceModule(_audio_api, on_command=_on_command) _voice_module.start() - print(f" [Voice] Always listening (Gemini voice)") + print(" [Voice] Always listening (Whisper + G1 mic + TtsMaker)") except Exception as e: print(f" [Voice] Init failed: {e} — continuing without voice") _audio_api = None @@ -255,7 +272,7 @@ def process_command(cmd: str) -> dict: # ── Greeting ───────────────────────────────────────────────────────── if re.match(r"^(?:hi+|hey+|hello+|hola|salam|marhaba|sup|yo+|ahlan|السلام عليكم|مرحبا|أهلا|هلا|يا هلا)\s*[!.]*$", cmd, re.IGNORECASE): - response = "Hello! I am Marcus. How can I help you?" + response = "Hello! I am Sanad. How can I help you?" print(f"Marcus: {response}") add_to_history(cmd, response) log_cmd(cmd, response) @@ -346,10 +363,15 @@ def _handle_llava(cmd): t0 = time.time() img = get_frame() + # Poll up to 500 ms in 50 ms slices instead of blocking a full second. + # Returns the moment a frame is available — most drops recover in <100 ms. 
if img is None: print(" Waiting for camera...") - time.sleep(1.0) - img = get_frame() + for _ in range(10): + time.sleep(0.05) + img = get_frame() + if img is not None: + break if img is None: print(" Camera not ready — command cancelled") @@ -461,7 +483,7 @@ def run_terminal(): status = get_brain_status() print() print("=" * 48) - print(" MARCUS AI BRAIN — READY") + print(" SANAD AI BRAIN — READY") print("=" * 48) for k, v in status.items(): print(f" {k:<10}: {v}") diff --git a/Config/config_Brain.json b/Config/config_Brain.json index a7f208b..62b2cb5 100644 --- a/Config/config_Brain.json +++ b/Config/config_Brain.json @@ -3,13 +3,19 @@ "max_history": 6, "num_batch": 128, "num_ctx": 2048, - "num_predict_main": 200, + "subsystems": { + "lidar": true, + "voice": true, + "imgsearch": false, + "autonomous": true + }, + "num_predict_main": 120, "num_predict_goal": 80, "num_predict_patrol": 100, "num_predict_talk": 80, "num_predict_verify": 10, "warmup_num_predict": 5, - "main_prompt": "You are Marcus, a humanoid robot. Look at the image and follow the command.\n{facts}\n\nCommand: \"{command}\"\n\nReply with ONLY this JSON — no markdown, no explanation:\n{{\"actions\":[{{\"move\":\"forward|backward|left|right|stop\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"one sentence\",\"abort\":null}}\n\nRULES:\n- actions is a list of movement steps, max duration 5.0s each\n- move: \"forward\" \"backward\" \"left\" \"right\" \"stop\"\n- arm: \"wave\" \"raise_right\" \"raise_left\" \"clap\" \"high_five\" \"hug\" \"heart\" \"shake_hand\" \"face_wave\" or null\n- arm is NEVER a move value\n- questions/descriptions: actions=[]\n- obstacle < 0.5m: abort = \"obstacle detected\"\n- \"90 degrees\" = 5.0s | \"45 degrees\" = 2.5s | \"1 step\" = 1.0s\n\nEXAMPLES:\n\"turn right\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Turning right\",\"abort\":null}}\n\"turn right 90 degrees\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":5.0}}],\"arm\":null,\"speak\":\"Turning 90 degrees\",\"abort\":null}}\n\"move back then left\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}},{{\"move\":\"left\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving back then left\",\"abort\":null}}\n\"wave\" -> {{\"actions\":[],\"arm\":\"wave\",\"speak\":\"Waving\",\"abort\":null}}\n\"raise your right arm\" -> {{\"actions\":[],\"arm\":\"raise_right\",\"speak\":\"Raising right arm\",\"abort\":null}}\n\"walk forward and wave\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":\"wave\",\"speak\":\"Walking and waving\",\"abort\":null}}\n\"what do you see\" -> {{\"actions\":[],\"arm\":null,\"speak\":\"I see...\",\"abort\":null}}\n\"stop\" -> {{\"actions\":[{{\"move\":\"stop\",\"duration\":0}}],\"arm\":null,\"speak\":\"Stopping\",\"abort\":null}}\n\"come to me\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming to you\",\"abort\":null}}\n\"come back\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming back\",\"abort\":null}}\n\"come here\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming\",\"abort\":null}}\n\"get closer\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":1.0}}],\"arm\":null,\"speak\":\"Moving closer\",\"abort\":null}}\n\"go away\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving away\",\"abort\":null}}\n\nCommand: \"{command}\"\nJSON:", - "goal_prompt": "You are Marcus navigating 
toward a goal.\n\nGOAL: \"{goal}\"\n\nLook at the image. Have you reached the goal?\n\nReply ONLY this JSON:\n{{\"reached\":false,\"next_move\":\"left\",\"duration\":0.5,\"speak\":\"what you see\"}}\n\nRULES:\n- reached: true ONLY if you clearly see the goal target right now\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.3 to 0.8 seconds\n- Default next_move: \"left\" to keep scanning\n\nGOAL: \"{goal}\"\nJSON:", - "patrol_prompt": "You are Marcus, an HSE inspection robot on autonomous patrol.\n\nLook at the camera and assess the scene.\n\nReply ONLY this JSON:\n{{\"observation\":\"one sentence\",\"alert\":null,\"next_move\":\"forward\",\"duration\":1.0}}\n\nRULES:\n- alert = null if safe\n- alert = \"PPE: no helmet\" if person without helmet\n- alert = \"PPE: no vest\" if person without safety vest\n- alert = \"Hazard: description\" for other hazards\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.5 to 2.0s\n\nJSON:" + "main_prompt": "You are Sanad, a humanoid robot. Look at the image and follow the command.\n{facts}\n\nCommand: \"{command}\"\n\nReply with ONLY this JSON — no markdown, no explanation:\n{{\"actions\":[{{\"move\":\"forward|backward|left|right|stop\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"one sentence\",\"abort\":null}}\n\nRULES:\n- actions is a list of movement steps, max duration 5.0s each\n- move: \"forward\" \"backward\" \"left\" \"right\" \"stop\"\n- arm: \"wave\" \"raise_right\" \"raise_left\" \"clap\" \"high_five\" \"hug\" \"heart\" \"shake_hand\" \"face_wave\" or null\n- arm is NEVER a move value\n- questions/descriptions: actions=[]\n- obstacle < 0.5m: abort = \"obstacle detected\"\n- \"90 degrees\" = 5.0s | \"45 degrees\" = 2.5s | \"1 step\" = 1.0s\n\nEXAMPLES:\n\"turn right\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Turning right\",\"abort\":null}}\n\"turn right 90 degrees\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":5.0}}],\"arm\":null,\"speak\":\"Turning 90 degrees\",\"abort\":null}}\n\"move back then left\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}},{{\"move\":\"left\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving back then left\",\"abort\":null}}\n\"wave\" -> {{\"actions\":[],\"arm\":\"wave\",\"speak\":\"Waving\",\"abort\":null}}\n\"raise your right arm\" -> {{\"actions\":[],\"arm\":\"raise_right\",\"speak\":\"Raising right arm\",\"abort\":null}}\n\"walk forward and wave\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":\"wave\",\"speak\":\"Walking and waving\",\"abort\":null}}\n\"what do you see\" -> {{\"actions\":[],\"arm\":null,\"speak\":\"I see...\",\"abort\":null}}\n\"stop\" -> {{\"actions\":[{{\"move\":\"stop\",\"duration\":0}}],\"arm\":null,\"speak\":\"Stopping\",\"abort\":null}}\n\"come to me\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming to you\",\"abort\":null}}\n\"come back\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming back\",\"abort\":null}}\n\"come here\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming\",\"abort\":null}}\n\"get closer\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":1.0}}],\"arm\":null,\"speak\":\"Moving closer\",\"abort\":null}}\n\"go away\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving away\",\"abort\":null}}\n\nCommand: \"{command}\"\nJSON:", + "goal_prompt": "You are Sanad navigating toward a goal.\n\nGOAL: 
\"{goal}\"\n\nLook at the image. Have you reached the goal?\n\nReply ONLY this JSON:\n{{\"reached\":false,\"next_move\":\"left\",\"duration\":0.5,\"speak\":\"what you see\"}}\n\nRULES:\n- reached: true ONLY if you clearly see the goal target right now\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.3 to 0.8 seconds\n- Default next_move: \"left\" to keep scanning\n\nGOAL: \"{goal}\"\nJSON:", + "patrol_prompt": "You are Sanad, an HSE inspection robot on autonomous patrol.\n\nLook at the camera and assess the scene.\n\nReply ONLY this JSON:\n{{\"observation\":\"one sentence\",\"alert\":null,\"next_move\":\"forward\",\"duration\":1.0}}\n\nRULES:\n- alert = null if safe\n- alert = \"PPE: no helmet\" if person without helmet\n- alert = \"PPE: no vest\" if person without safety vest\n- alert = \"Hazard: description\" for other hazards\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.5 to 2.0s\n\nJSON:" } diff --git a/Config/config_Voice.json b/Config/config_Voice.json index 07c6d79..b5d9a3a 100644 --- a/Config/config_Voice.json +++ b/Config/config_Voice.json @@ -1,55 +1,46 @@ { "tts": { - "piper_voice_ar": "ar_JO-kareem-medium", - "piper_voice_en": "en_US-lessac-medium", - "piper_sample_rate": 22050, + "backend": "builtin_ttsmaker", "builtin_speaker_id": 0, - "target_sample_rate": 16000, - "piper_timeout_sec": 120, - "en_backend": "edge_tts", - "ar_backend": "edge_tts", - "edge_voice_ar": "ar-AE-HamdanNeural", - "edge_voice_en": "en-US-GuyNeural" + "target_sample_rate": 16000 }, "stt": { "wake_model": "tiny", "command_model": "small", - "wake_words_en": [ - "marcus", - "marcos", - "markus" - ], - "wake_words_ar": [ - "ماركوس", - "مارکوس", - "ماركس" - ], - "language": null, + "wake_words_en": ["sanad", "sannad", "sanat", "sunnat"], + "language": "en", "command_timeout_sec": 10, "silence_threshold": 500, "silence_duration_sec": 1.5, "max_record_sec": 15 }, "mic": { + "backend": "builtin_udp", "source_index": "3", "format": "s16le", "rate": 16000, "channels": 1 }, + "mic_udp": { + "group": "239.168.123.161", + "port": 5555, + "buffer_max_bytes": 64000, + "read_timeout_sec": 0.04 + }, "speaker": { "dds_interface": "eth0", "volume": 100, - "app_name": "marcus" + "app_name": "sanad" }, "audio": { "data_dir": "Data/Voice/Recordings", "log_file": "logs/voice.log" }, "messages": { - "wake_heard": "Listening...", - "no_speech": "I didn't catch that. Say my name again.", + "wake_heard": "Listening", + "no_speech": "I didn't catch that, please say it again", "error_tts": "Speech synthesis failed", "error_mic": "Microphone error", "ready": "Voice system ready" } -} \ No newline at end of file +} diff --git a/Config/marcus_prompts.yaml b/Config/marcus_prompts.yaml index 1ea43c0..1460de2 100644 --- a/Config/marcus_prompts.yaml +++ b/Config/marcus_prompts.yaml @@ -26,7 +26,7 @@ # ══════════════════════════════════════════════════════════════════════════════ main_prompt: | - You are Marcus — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai. + You are Sanad — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai. You are physically present in the room. You have a body, arms, and a camera. You follow commands from your operator and respond intelligently. 
  {facts}

@@ -184,7 +184,7 @@ main_prompt: |
  → {{"actions":[],"arm":null,"speak":"Nice to meet you Kassam, I will remember that","abort":null}}

  "who are you"
-  → {{"actions":[],"arm":null,"speak":"I am Marcus, a humanoid robot assistant by YS Lootah Technology","abort":null}}
+  → {{"actions":[],"arm":null,"speak":"I am Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}

  Safety:
  "walk into the wall"
@@ -307,7 +307,7 @@ main_prompt: |
# ══════════════════════════════════════════════════════════════════════════════

goal_prompt: |
-  You are Marcus, a humanoid robot actively navigating toward a specific target.
+  You are Sanad, a humanoid robot actively navigating toward a specific target.

  YOUR MISSION: "{goal}"
@@ -392,7 +392,7 @@ goal_prompt: |
# ══════════════════════════════════════════════════════════════════════════════

patrol_prompt: |
-  You are Marcus, a humanoid robot autonomously exploring and mapping an office environment.
+  You are Sanad, a humanoid robot autonomously exploring and mapping an office environment.

  Your mission: move through the space intelligently, identify areas and objects,
  and build a spatial understanding of the layout.
@@ -463,7 +463,7 @@ patrol_prompt: |
# ══════════════════════════════════════════════════════════════════════════════

talk_prompt: |
-  You are Marcus, a humanoid robot assistant. You have been asked a question
+  You are Sanad, a humanoid robot assistant. You have been asked a question
  or given information. Do NOT move — just respond intelligently.

  {facts}
@@ -509,7 +509,7 @@ talk_prompt: |
  → {{"actions":[],"arm":null,"speak":"All systems are functioning normally and I am ready to assist you","abort":null}}

  "what is your name"
-  → {{"actions":[],"arm":null,"speak":"My name is Marcus, a humanoid robot assistant by YS Lootah Technology","abort":null}}
+  → {{"actions":[],"arm":null,"speak":"My name is Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}

  "who built you"
  → {{"actions":[],"arm":null,"speak":"I was built by YS Lootah Technology in Dubai, and my programmer is Kassam","abort":null}}
diff --git a/Core/Logger.py b/Core/log_backend.py
similarity index 100%
rename from Core/Logger.py
rename to Core/log_backend.py
diff --git a/Core/logger.py b/Core/logger.py
index cceda33..03eca31 100644
--- a/Core/logger.py
+++ b/Core/logger.py
@@ -1,9 +1,13 @@
 """
-logger.py — Project-wide logging via Logger.py
+logger.py — Project-wide configured logging instance.
+
+Imports the `Logs` backend class from log_backend.py (formerly Logger.py;
+renamed to avoid a case-only filename collision with this module, which
+breaks checkouts on any case-insensitive filesystem — macOS default HFS+/APFS, Windows).
""" import os from Core.env_loader import PROJECT_ROOT -from Core.Logger import Logs +from Core.log_backend import Logs # Single shared instance — all modules use this _logs = Logs(main_log_file=os.path.join(PROJECT_ROOT, "logs", "main.log")) diff --git a/Doc/MARCUS_API.md b/Doc/MARCUS_API.md index 8c40865..94b40bf 100644 --- a/Doc/MARCUS_API.md +++ b/Doc/MARCUS_API.md @@ -1,8 +1,37 @@ # Marcus — Full API & Developer Reference **Project:** Marcus | YS Lootah Technology | Jetson Orin NX + G1 EDU -**Scripts:** `~/Models_marcus/marcus_llava.py` + `~/Models_marcus/marcus_yolo.py` -**Updated:** April 4, 2026 +**Robot persona:** Sanad (wake word + self-intro; project code stays under `Marcus/`) +**Entry points:** `run_marcus.py` (terminal) / `Server/marcus_server.py` (WebSocket) +**Updated:** 2026-04-21 + +> **What changed since the early draft (April 4):** The project was restructured +> from two monolithic scripts (`marcus_llava.py` + `marcus_yolo.py`) into a +> layered architecture. See `Doc/architecture.md` for the current file tree and +> `Doc/environment.md` for the verified Jetson software stack, exact library +> versions, and GPU bring-up recipe. This reference still describes the +> function-level semantics (inputs/outputs/examples) — treat any file path in +> this document as illustrative and cross-check the actual module. Recent +> deltas called out inline below. + +### Recent API deltas (2026-04-21) + +| Change | Location | Note | +|---|---|---| +| GPU is mandatory for YOLO | `Config/config_Vision.json`, `Vision/marcus_yolo.py` | `yolo_device` defaults to `"cuda"` and is enforced; `_resolve_device()` raises `RuntimeError` on missing CUDA. `yolo_half=true` runs FP16 on Orin (capability 8.7). | +| Ollama model | `Config/config_Brain.json` | Default `ollama_model` is `qwen2.5vl:3b` (not `llava:7b`). | +| Ollama compute-graph caps | `Config/config_Brain.json` | `num_batch=128`, `num_ctx=2048` — required on 16 GB Orin NX to prevent the llama runner OOM. Propagated by `API/llava_api.py` and `Vision/marcus_imgsearch.py` to every `ollama.chat` call. | +| `num_predict_main` lowered | `Config/config_Brain.json` | 200 → 120 (shaves ~400–600 ms per open-ended command; JSON still parses). | +| ZMQ bind moved out of import | `API/zmq_api.py` | `init_zmq()` must be called from the main process before any `send_vel/send_cmd`. `init_brain()` does this. Children spawned via `multiprocessing` no longer collide on port 5556. | +| Camera-retry poll | `Brain/marcus_brain.py::_handle_llava` | Replaced `time.sleep(1.0)` with 10×50 ms polls. | +| Conditional scan sleeps | `Navigation/goal_nav.py`, `Autonomous/marcus_autonomous.py` | Removed unconditional per-step naps when real work (YOLO hit, LLaVA call, forward move) already consumed wall time. | +| Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. | +| Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. | +| Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. 
| +| Gemini voice deleted | `Voice/marcus_gemini_voice.py` removed | `_init_voice()` now spawns `Voice.marcus_voice.VoiceModule` (Whisper wake + command STT). No more WebSocket, no more asyncio event loop, no API key. | +| Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. | +| Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. | +| Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. | --- @@ -22,38 +51,54 @@ 12. [JSON Schema Reference](#12-json-schema-reference) 13. [Environment & Paths](#13-environment--paths) 14. [Quick Reference Card](#14-quick-reference-card) +15. [Voice API (mic + TTS + STT)](#15-voice-api-mic--tts--stt) --- ## 1. Configuration Variables -Defined at the top of `marcus_llava.py`. Edit here to change global behavior. +All configuration is now **JSON-driven** and lives under `Config/`. Each module +loads its config at startup via `Core.config_loader.load_config(name)`. -| Variable | Default | Description | -|----------|---------|-------------| -| `ZMQ_HOST` | `"127.0.0.1"` | Holosoma ZMQ host | -| `ZMQ_PORT` | `5556` | Holosoma ZMQ port | -| `ZMQ_YOLO_PORT` | `5557` | YOLO ZMQ port (standalone mode) | -| `OLLAMA_MODEL` | `"llava:7b"` | LLaVA model via Ollama | -| `CAM_WIDTH` | `424` | Camera capture width (px) | -| `CAM_HEIGHT` | `240` | Camera capture height (px) | -| `CAM_FPS` | `15` | Camera frame rate | -| `CAM_QUALITY` | `70` | JPEG quality sent to LLaVA | -| `STOP_ITERATIONS` | `20` | gradual_stop message count | -| `STOP_DELAY` | `0.05` | seconds between stop messages | -| `STEP_PAUSE` | `0.3` | pause between consecutive action steps | -| `ARM_SDK_PATH` | `/home/unitree/unitree_sdk2_python` | Arm SDK path | -| `ARM_INTERFACE` | `"eth0"` | Network interface for arm SDK | +**`Config/config_ZMQ.json`** (Holosoma bridge) -Defined at top of `marcus_yolo.py`: +| Key | Default | Description | +|---|---|---| +| `zmq_host` | `"127.0.0.1"` | Holosoma ZMQ host | +| `zmq_port` | `5556` | Holosoma ZMQ port | +| `stop_iterations` | `20` | `gradual_stop()` message count | +| `stop_delay` | `0.05` | seconds between stop messages | +| `step_pause` | `0.3` | pause between consecutive action steps | -| Variable | Default | Description | -|----------|---------|-------------| -| `YOLO_MODEL_PATH` | `.../Model/yolov8m.pt` | YOLO model path | -| `YOLO_CONFIDENCE` | `0.45` | Minimum detection confidence | -| `YOLO_IOU` | `0.45` | NMS IOU threshold | -| `YOLO_DEVICE` | `"cpu"` | Inference device ("cpu" or "cuda") | -| `YOLO_IMG_SIZE` | `320` | Inference image size (smaller = faster) | +**`Config/config_Brain.json`** (Ollama VL model) + +| Key | Default | Description | +|---|---|---| +| `ollama_model` | `"qwen2.5vl:3b"` | Ollama model tag | +| `max_history` | `6` | conversation turns retained | +| `num_batch` | `128` | llama.cpp batch — **cap, required for Jetson** | +| `num_ctx` | `2048` | llama.cpp KV context length — **cap, required for Jetson** | +| `num_predict_main` | `120` | max tokens for the main command path | +| `num_predict_goal` | `80` | goal-navigation call | +| `num_predict_patrol` | `100` | 
autonomous patrol call | +| `num_predict_talk` | `80` | talk-only path | +| `num_predict_verify` | `10` | YOLO condition verifier (`yes`/`no`) | + +**`Config/config_Vision.json`** (YOLO) + +| Key | Default | Description | +|---|---|---| +| `yolo_model_path` | `"Models/yolov8m.pt"` | weights file (auto-fetched if missing) | +| `yolo_confidence` | `0.45` | detection confidence threshold | +| `yolo_iou` | `0.45` | NMS IOU threshold | +| `yolo_device` | `"cuda"` | **GPU required** — `"cpu"` raises `RuntimeError` | +| `yolo_half` | `true` | FP16 inference (Ampere tensor cores) | +| `yolo_img_size` | `320` | inference image size | +| `tracked_classes` | 19 COCO classes | filter for relevant detections | + +**`Config/config_Camera.json`**: `424x240 @ 15 fps`, `JPEG quality 70`. +**`Config/config_Voice.json`**: see section 6 below. +**`Config/config_Network.json`**: Jetson eth0/wlan0 IPs, WebSocket port. --- @@ -61,20 +106,28 @@ Defined at top of `marcus_yolo.py`: ### Setup +The bind is no longer an import-time side effect. It runs inside `init_zmq()`, called once by `init_brain()` from the main process. Children (e.g. the LiDAR SLAM worker spawned via `multiprocessing.spawn`) can re-import `API.zmq_api` without rebinding. + ```python -ctx = zmq.Context() -sock = ctx.socket(zmq.PUB) -sock.bind("tcp://127.0.0.1:5556") -time.sleep(0.5) +# API/zmq_api.py — bind happens here, not at module import +def init_zmq() -> zmq.Socket: + global ctx, sock + if sock is not None: + return sock # idempotent + ctx = zmq.Context() + sock = ctx.socket(zmq.PUB) + sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}") + time.sleep(0.5) # let SUBs attach + return sock ``` ### `send_vel(vx, vy, vyaw)` -Send velocity command to Holosoma. +Send velocity command to Holosoma. Raises `RuntimeError` if `init_zmq()` wasn't called. ```python def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0): - sock.send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}})) + _ensure_sock().send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}})) ``` | Parameter | Unit | Safe range | Effect | @@ -661,14 +714,17 @@ from unitree_sdk2py.g1.arm.g1_arm_action_client import G1ArmActionClient # Arm ``` STARTUP: - Tab 1: source ~/.holosoma_deps/miniconda3/bin/activate hsinference - cd ~/holosoma && sudo jetson_clocks + Tab 1 (hsinference env): Holosoma locomotion policy python3 run_policy.py inference:g1-29dof-loco \ --task.velocity-input zmq --task.state-input zmq --task.interface eth0 - Tab 2: ollama serve & - /home/unitree/miniconda3/envs/marcus/bin/python3 ~/Models_marcus/marcus_llava.py - (YOLO starts automatically — no Tab 3 needed) + Tab 2: ollama serve > /tmp/ollama.log 2>&1 & + sleep 3 + + Tab 3 (marcus env): conda activate marcus && cd ~/Marcus && python3 run_marcus.py + (YOLO + voice + LiDAR all start automatically per subsystems flags) + +WAKE WORD: "Sanad" COMMANDS: walk forward · turn right · turn left · move back @@ -704,4 +760,74 @@ SAFETY: --- +## 15. Voice API (mic + TTS + STT) + +New pipeline as of 2026-04-21. Replaces the Gemini live WebSocket + edge-tts/Piper stack. + +### Mic — `Voice.builtin_mic.BuiltinMic` + +Captures the G1's on-board array microphone over UDP multicast. No USB mic required. 16 kHz mono int16 PCM natively; no resampling needed. 
+ +```python +from Voice.builtin_mic import BuiltinMic +mic = BuiltinMic(group="239.168.123.161", port=5555, buf_max=64_000) +mic.start() +try: + pcm = mic.read_chunk(1024) # 512 samples, ~32 ms, int16 mono + # or + pcm = mic.read_seconds(3.0) +finally: + mic.stop() +``` + +Config under `config_Voice.json::mic_udp`. + +### TTS — `Voice.builtin_tts.BuiltinTTS` + +Wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker`. English only — refuses non-ASCII input. + +```python +from Voice.builtin_tts import BuiltinTTS +tts = BuiltinTTS(audio_client, default_speaker_id=0) +tts.speak("Hello, I am Sanad", block=True) # synth + play on G1 body speaker +``` + +Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly. + +### Wake + command loop — `Voice.marcus_voice.VoiceModule` + +Four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` detects the wake word, `small` transcribes commands. + +```python +from API.audio_api import AudioAPI +from Voice.marcus_voice import VoiceModule + +def on_command(text, lang): + print(f"heard: {text}") + +audio = AudioAPI() +voice = VoiceModule(audio, on_command=on_command) +voice.start() # background thread +# ... later ... +voice.stop() +``` + +Wake words are configured in `config_Voice.json::stt.wake_words_en`. The brain's `_init_voice()` wires `on_command` to `process_command(text)` + `audio_api.speak(reply)`. + +### AudioAPI — `API.audio_api.AudioAPI` + +Orchestration layer. Owns the `AudioClient`, manages mute/unmute, exposes a clean `speak` + `record` API. + +```python +from API.audio_api import AudioAPI +audio = AudioAPI() +audio.speak("Hello") # English only; non-ASCII returns early +pcm = audio.record(seconds=5) # int16 mono 16 kHz — uses BuiltinMic +audio.play_pcm(pcm) # raw PCM playback via Unitree RPC +``` + +Config: `config_Voice.json::tts.backend = "builtin_ttsmaker"`, `mic.backend = "builtin_udp"` (or `"pactl_parec"` to fall back to Hollyland). + +--- + *Marcus — YS Lootah Technology | Kassam | April 2026* diff --git a/Doc/MARCUS_progress.pdf b/Doc/MARCUS_progress.pdf deleted file mode 100644 index b495b60..0000000 Binary files a/Doc/MARCUS_progress.pdf and /dev/null differ diff --git a/Doc/Marcus_Project.pdf b/Doc/Marcus_Project.pdf deleted file mode 100644 index bf6a656..0000000 Binary files a/Doc/Marcus_Project.pdf and /dev/null differ diff --git a/Doc/architecture.md b/Doc/architecture.md index e2258ab..b3dc54d 100644 --- a/Doc/architecture.md +++ b/Doc/architecture.md @@ -1,20 +1,39 @@ # Marcus — System Architecture **Project**: Marcus | YS Lootah Technology -**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX (16GB) -**Updated**: 2026-04-06 +**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX 16 GB +**Robot persona**: **Sanad** (wake word + self-intro; project code still lives under `Marcus/`) +**Updated**: 2026-04-21 + +--- + +## Recent deltas (since 2026-04-06) + +- **GPU-only YOLO** — `_resolve_device()` raises `RuntimeError` if CUDA is missing. `yolo_device=cuda`, `yolo_half=true` by default. +- **Ollama compute-graph caps** — `num_batch=128`, `num_ctx=2048` in `config_Brain.json` (otherwise llama.cpp OOMs on the 16 GB Jetson). +- **`num_predict_main: 120`** (was 200) — saves ~400-600 ms per open-ended command. +- **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import. 
+- **G1 built-in microphone** via UDP multicast `239.168.123.161:5555` — `Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic. +- **G1 built-in TTS** via `client.TtsMaker()` — `Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed. +- **Gemini voice module deleted** — Whisper wake-word + command STT path is now authoritative (`Voice/marcus_voice.py`). +- **Subsystem flags** — `config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages. +- **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps. +- **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo clones cleanly on macOS/Windows. +- **Robot persona = "Sanad"** — wake words, prompts, banner, and self-intro all use "Sanad". Project identity ("Marcus") remains in file names, class names, directory, logs. + +See `Doc/environment.md` for the verified Jetson software stack and `Doc/pipeline.md` for the end-to-end data flow. --- ## Overview -Marcus is a fully offline humanoid robot AI system. The brain runs on Jetson Orin NX with no cloud dependencies. It uses vision-language models (Qwen2.5-VL via Ollama) for understanding commands, YOLO for real-time object detection, dead reckoning for position tracking, and persistent memory across sessions. +Marcus is a mostly-offline humanoid robot AI system. The brain runs on Jetson Orin NX using a local vision-language model (Qwen2.5-VL via Ollama) for open-ended commands, YOLOv8m for real-time object detection (CUDA + FP16), dead reckoning + optional ROS2 odometry for pose, Livox Mid-360 LiDAR + a custom SLAM worker for mapping, and persistent memory across sessions. Two operating modes: -- **Terminal mode** (`run_marcus.py`) — direct keyboard control on the Jetson -- **Server mode** (`Server/marcus_server.py`) — WebSocket server allowing remote CLI or GUI clients +- **Terminal mode** (`run_marcus.py`) — direct keyboard control on the Jetson. Voice subsystem runs alongside by default. +- **Server mode** (`Server/marcus_server.py`) — WebSocket server allowing remote CLI or GUI clients. -Both modes use the **same brain** — identical command processing, same YOLO, same memory, same movement control. +Both modes use the **same brain** — identical command processing, same YOLO, same memory, same movement control. Voice, LiDAR, image-search and autonomous-patrol are gated behind `config_Brain.json::subsystems` flags. 
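+
+As a sketch of that shared contract (illustrative only; error handling, config
+loading, and the interactive loop are omitted), both entry points reduce to:
+
+```python
+# Hypothetical minimal client of the brain layer, not a file in this repo.
+from Brain.marcus_brain import init_brain, process_command
+
+init_brain()                            # binds ZMQ, starts camera/YOLO, gated subsystems
+result = process_command("turn right")  # same path for terminal and WebSocket input
+if isinstance(result, dict):            # brain replies are JSON-shaped dicts
+    print(result.get("speak", ""))      # voice mode sends this string to TtsMaker
+```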
--- @@ -28,14 +47,14 @@ Marcus/ ├── Core/ # Foundation layer — no external deps │ ├── env_loader.py # Reads .env, resolves PROJECT_ROOT │ ├── config_loader.py # load_config(name) → reads Config/config_{name}.json -│ ├── Logger.py # Logging engine (file-based, no console output) +│ ├── log_backend.py # Logging engine (file-based, no console output) — was Logger.py │ └── logger.py # Project wrapper: log(), log_and_print(), get_logger() │ ├── Config/ # ALL configuration — one JSON per module │ ├── config_ZMQ.json # ZMQ host, port, stop params │ ├── config_Camera.json # RealSense resolution, fps, quality -│ ├── config_Brain.json # Ollama model, prompts, num_predict values -│ ├── config_Vision.json # YOLO model path, confidence, tracked classes +│ ├── config_Brain.json # Ollama model, prompts, num_predict, num_batch/ctx, subsystems +│ ├── config_Vision.json # YOLO model path, device=cuda, half=true, confidence, tracked classes │ ├── config_Navigation.json # move_map, goal aliases, YOLO goal classes │ ├── config_Patrol.json # patrol duration, proximity threshold │ ├── config_Arm.json # arm actions, aliases, availability flag @@ -43,17 +62,26 @@ Marcus/ │ ├── config_Memory.json # session/places paths │ ├── config_Network.json # Jetson IPs (eth0/wlan0), ports │ ├── config_ImageSearch.json # search defaults -│ └── marcus_prompts.yaml # All LLaVA/Qwen prompts (main, goal, patrol, talk, verify) +│ ├── config_Voice.json # mic (builtin_udp|pactl_parec), TTS backend, wake words, mic_udp group/port +│ ├── config_LiDAR.json # Livox Mid-360 connection + SLAM engine params +│ └── marcus_prompts.yaml # All Qwen-VL prompts (main, goal, patrol, talk, verify) │ ├── API/ # Interface layer — one file per subsystem -│ ├── zmq_api.py # ZMQ PUB socket: send_vel(), gradual_stop(), send_cmd() +│ ├── zmq_api.py # ZMQ PUB socket: init_zmq(), send_vel(), gradual_stop(), send_cmd() │ ├── camera_api.py # RealSense thread: start/stop_camera(), get_frame() -│ ├── llava_api.py # LLaVA queries: call_llava(), ask(), ask_goal(), ask_patrol() -│ ├── yolo_api.py # YOLO interface: init_yolo(), yolo_sees(), yolo_summary()... 
+│ ├── llava_api.py # Qwen2.5-VL queries via Ollama: call_llava(), ask(), ask_goal()… +│ ├── yolo_api.py # YOLO interface: init_yolo(), yolo_sees(), yolo_summary()… │ ├── odometry_api.py # Odometry wrapper: init_odometry(), get_position() │ ├── memory_api.py # Memory wrapper: init_memory(), log_cmd(), place_save/goto() -│ ├── arm_api.py # Arm gestures: do_arm(), ARM_ACTIONS, ALL_ARM_NAMES -│ └── imgsearch_api.py # Image search wrapper: init_imgsearch(), get_searcher() +│ ├── arm_api.py # Arm gestures: do_arm(), ARM_ACTIONS, ALL_ARM_NAMES (stub) +│ ├── imgsearch_api.py # Image search wrapper: init_imgsearch(), get_searcher() +│ ├── audio_api.py # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic +│ └── lidar_api.py # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status() +│ +├── Voice/ # Mic + TTS + wake-word STT +│ ├── builtin_mic.py # G1 array mic via UDP multicast 239.168.123.161:5555 +│ ├── builtin_tts.py # BuiltinTTS — client.TtsMaker(text, speaker_id) +│ └── marcus_voice.py # VoiceModule — Whisper tiny (wake) + small (command) state machine │ ├── Brain/ # Decision logic — imports ONLY from API/ │ ├── marcus_brain.py # Orchestrator: init_brain(), process_command(), run_terminal() @@ -127,39 +155,40 @@ Marcus/ │ Server/marcus_server.py (WebSocket) │ └──────────────────┬──────────────────────────────┘ │ -┌──────────────────▼──────────────────────────────┐ -│ Brain Layer │ -│ marcus_brain.py — init_brain() │ -│ — process_command(cmd) │ -│ command_parser.py — 14 regex local commands │ -│ executor.py — execute LLaVA decisions │ -│ marcus_memory.py — session + place memory │ -└──────────────────┬──────────────────────────────┘ +┌──────────────────▼──────────────────────────────────┐ +│ Brain Layer │ +│ marcus_brain.py — init_brain() / process_command │ +│ command_parser.py — regex-table local commands │ +│ executor.py — execute Qwen-VL decisions │ +│ marcus_memory.py — session + place memory │ +└──────────────────┬──────────────────────────────────┘ │ imports only from API/ -┌──────────────────▼──────────────────────────────┐ -│ API Layer │ -│ zmq_api camera_api llava_api │ -│ yolo_api odometry_api memory_api │ -│ arm_api imgsearch_api │ -└──────────────────┬──────────────────────────────┘ - │ wraps -┌──────────────────▼──────────────────────────────┐ -│ Navigation / Vision │ -│ goal_nav.py marcus_yolo.py │ -│ patrol.py marcus_imgsearch.py │ -│ marcus_odometry.py │ -└──────────────────┬──────────────────────────────┘ - │ -┌──────────────────▼──────────────────────────────┐ -│ Core Layer │ -│ env_loader.py config_loader.py │ -│ Logger.py logger.py │ -└──────────────────┬──────────────────────────────┘ +┌──────────────────▼──────────────────────────────────┐ +│ API Layer │ +│ zmq_api camera_api llava_api audio_api │ +│ yolo_api odometry_api memory_api imgsearch_api │ +│ arm_api lidar_api │ +└──────────────┬───────────────────────┬──────────────┘ + │ wraps │ wraps +┌──────────────▼───────────┐ ┌────────▼────────────────┐ +│ Navigation / Vision │ │ Voice │ +│ goal_nav.py │ │ builtin_mic.py │ +│ patrol.py │ │ builtin_tts.py │ +│ marcus_odometry.py │ │ marcus_voice.py │ +│ marcus_yolo.py │ │ (Whisper + TtsMaker) │ +│ marcus_imgsearch.py │ └──────────┬──────────────┘ +└──────────────┬───────────┘ │ + │ │ +┌──────────────▼─────────────────────────▼────────────┐ +│ Core Layer │ +│ env_loader.py config_loader.py │ +│ log_backend.py logger.py │ +└──────────────────┬──────────────────────────────────┘ │ reads -┌──────────────────▼──────────────────────────────┐ -│ Config / .env │ -│ 
11 JSON files + marcus_prompts.yaml │
-└─────────────────────────────────────────────────┘
+┌──────────────────▼──────────────────────────────────┐
+│ Config / .env │
+│ 13 JSON files + marcus_prompts.yaml │
+└──────────────────────────────────────────────────────┘
```

**Rule**: Brain never imports from Vision/ or Navigation/ directly. It goes through the API layer.
@@ -176,11 +205,11 @@ Reads `.env` from the project root to resolve `PROJECT_ROOT`. Uses a minimal bui
#### `config_loader.py` (30 lines)
`load_config(name)` reads `Config/config_{name}.json` and caches the result. All modules call this instead of hardcoding constants. Also provides `config_path(relative)` to resolve relative paths (e.g., `"Models/yolov8m.pt"`) to absolute paths from PROJECT_ROOT.

-#### `Logger.py` (186 lines)
-Full logging engine from AI_Photographer. File-based only (no console output by default). Creates per-module log files in `logs/`. Handles write permission fallbacks, log name normalization, and corrupt log recovery.
+#### `log_backend.py` (186 lines, was `Logger.py`)
+Full logging engine ported from AI_Photographer. File-based only (no console output by default). Creates per-module log files in `logs/`. Handles write permission fallbacks, log name normalization, and corrupt log recovery. Renamed from `Logger.py` on 2026-04-21 to eliminate a case-only collision with `logger.py` that prevented the repo from cloning on case-insensitive filesystems (macOS/Windows).

#### `logger.py` (51 lines)
-Project wrapper around `Logger.py`. Provides:
+Project wrapper around `log_backend.Logs`. Provides:
- `log(message, level, module)` — write to `logs/{module}.log`
- `log_and_print(message, level, module)` — write + print
- `get_logger(module)` — get configured Logs instance
@@ -191,12 +220,13 @@ Project wrapper around `log_backend.Logs`. Provides:

Each API file wraps one subsystem. They read their own config via `load_config()`, handle import errors gracefully with fallback stubs, and export clean public functions.

-#### `zmq_api.py` (49 lines)
-Creates a ZMQ PUB socket on startup (binds to `tcp://127.0.0.1:{zmq_port}`). Holosoma's RL policy connects to this socket as SUB and receives velocity commands at 50Hz.
+#### `zmq_api.py` (~75 lines)
+Holds the ZMQ PUB socket used to drive Holosoma at 50 Hz. **The bind is no longer a module-import side effect** — it runs only when `init_zmq()` is called from the main (parent) process. This lets the LiDAR SLAM worker (spawned via `multiprocessing.spawn`) re-import the module without rebinding port 5556 and crashing.

**Exports:**
+- `init_zmq()` — idempotent bind, called once by `init_brain()`
- `send_vel(vx, vy, vyaw)` — send velocity to Holosoma
- `gradual_stop()` — 20 zero-velocity messages over 1 second
+- `send_cmd(cmd)` — Holosoma state machine (`start` / `walk` / `stand` / `stop`)
+- `get_socket()` — access the bound socket (for odometry to reuse)
-- `send_cmd(cmd)` — send state command: "start", "walk", "stand", "stop"
-- `get_socket()` — return the shared PUB socket (for odometry to reuse)
- `MOVE_MAP` — direction-to-velocity lookup: `{"forward": (0.3, 0, 0), "left": (0, 0, 0.3), ...}`
@@ -440,6 +472,37 @@ Supports text-only search (no reference image) using hint description.

---

+### Voice/
+
+Mic, TTS and wake-word pipeline. All three files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable).
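+
+How that gate plays out at boot, as a hedged sketch (the wiring is assumed; `load_config()` and `VoiceModule` are documented elsewhere in this file):
+
+```python
+# Sketch: init_brain() only touches Voice/ when the flag is on.
+from Core.config_loader import load_config
+
+if load_config("Brain").get("subsystems", {}).get("voice", False):
+    from API.audio_api import AudioAPI
+    from Voice.marcus_voice import VoiceModule
+
+    def on_command(text: str, lang: str) -> None:  # lang is always "en"
+        print("voice command:", text)              # real code would call the brain
+
+    voice = VoiceModule(AudioAPI(), on_command=on_command)
+    voice.start()  # background daemon thread; say "Sanad" to wake
+```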
+
+#### `builtin_mic.py` (~200 lines, new 2026-04-21)
+Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer.
+
+**Exports:**
+- `BuiltinMic(group, port, buf_max, read_timeout)` — init
+- `start()` / `stop()` — socket lifecycle (`start()` is idempotent)
+- `read_chunk(n)` — pull exactly `n` bytes (blocks up to `read_timeout`, pads silence otherwise)
+- `read_seconds(s)` — convenience for "record `s` seconds"
+- `flush()` — drop buffered audio (called while TTS plays, to avoid echo)
+
+#### `builtin_tts.py` (~90 lines, new 2026-04-21)
+Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input (the G1 silently maps Arabic to Chinese, which confuses everyone).
+
+**Exports:**
+- `BuiltinTTS(audio_client, default_speaker_id=0)` — init
+- `speak(text, speaker_id=None, block=True)` — synthesize and play on the G1 body speaker
+
+#### `marcus_voice.py` (~340 lines, rewired 2026-04-21)
+Always-listening English voice loop with a four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` listens for the wake word "Sanad" on 2-second chunks; Whisper `small` transcribes the full command. Mic input comes from `BuiltinMic`; responses go through `audio_api.speak()` → `BuiltinTTS`.
+
+**Exports:**
+- `VoiceModule(audio_api, on_command=cb)` — init
+- `start()` — spawn background thread
+- `stop()` — graceful teardown
+
+---
+
### Server/

#### `marcus_server.py` (224 lines)
diff --git a/Doc/controlling.md b/Doc/controlling.md
index a94a181..d645d03 100644
--- a/Doc/controlling.md
+++ b/Doc/controlling.md
@@ -1,15 +1,16 @@
# Marcus — Control & Startup Guide
-**Updated**: 2026-04-06
+**Robot persona:** Sanad (wake word + self-intro; project code lives under `Marcus/`)
+**Updated**: 2026-04-21

---

## Quick Start

-### Prerequisites (Jetson Orin NX)
+### Prerequisites (Jetson Orin NX, JetPack 5.1.1)

```bash
-# Terminal 1 — Start Holosoma (locomotion policy)
+# Terminal 1 — Start Holosoma (locomotion policy, in hsinference env)
source ~/.holosoma_deps/miniconda3/bin/activate hsinference
cd ~/holosoma
~/.holosoma_deps/miniconda3/envs/hsinference/bin/python3 \
@@ -19,28 +20,46 @@ cd ~/holosoma
--task.velocity-input zmq \
--task.state-input zmq \
--task.interface eth0
+
+# Terminal 2 — Ollama server (leave running)
+ollama serve > /tmp/ollama.log 2>&1 &
+sleep 3
+ollama list   # confirm qwen2.5vl:3b present
```

### Option A — Terminal Mode (on Jetson)

```bash
-# Terminal 2 — Start Marcus Brain
-conda activate Marcus
-ollama serve & sleep 3
+# Terminal 3 — Start Marcus Brain
+conda activate marcus
cd ~/Marcus
python3 run_marcus.py
```

-Direct keyboard control. All commands typed locally.
+Direct keyboard control + voice input (say **"Sanad"** to wake).
Expected banner on boot:
+
+```
+================================================
+   SANAD AI BRAIN — READY
+================================================
+  model    : qwen2.5vl:3b
+  yolo     : True
+  odometry : True
+  memory   : True
+  lidar    : True
+  voice    : True
+  camera   : 424x240@15
+```

### Option B — Server + Client (remote)

```bash
-# Terminal 2 (Jetson) — Start Server
+# Terminal 3 (Jetson) — Start Server
+conda activate marcus
cd ~/Marcus
python3 -m Server.marcus_server

-# Terminal 3 (Workstation) — Connect Client
+# Terminal 4 (Workstation) — Connect Client
cd ~/Robotics_workspace/yslootahtech/Project/Marcus
python3 -m Client.marcus_cli
```
@@ -58,6 +77,20 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`

---

+## Voice
+
+- **Wake word:** "Sanad" (variants "sannad", "sanat", "sunnat" — see `config_Voice.json::stt.wake_words_en`)
+- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed.
+- **STT:** Whisper `tiny` (wake detection) + Whisper `small` (command transcription) — both run locally.
+- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only.
+- **Barge-in:** anything said while the robot is speaking is flushed from the mic buffer and never reaches Whisper; wait for playback to end, then give the next command.
+
+Interaction flow: say "Sanad" → hear *"Listening"* → speak your command → see transcript on console → Marcus answers through the speaker.
+
+To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only ~2 s faster.
+
+---
+
## Command Reference

### Movement
| Command | Action |
|---------|--------|
@@ -75,17 +108,17 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`

### Vision
| Command | Action |
|---------|--------|
-| `what do you see` | LLaVA describes camera view |
-| `describe the room` | LLaVA scene description |
-| `is anyone here` | LLaVA person check |
+| `what do you see` | Qwen2.5-VL describes camera view |
+| `describe the room` | Qwen2.5-VL scene description |
+| `is anyone here` | Qwen2.5-VL person check |
| `yolo` | Show YOLO detection status |

### Goal Navigation
| Command | Action |
|---------|--------|
| `goal/ stop when you see a person` | YOLO fast search + stop |
-| `goal/ find a laptop` | YOLO + LLaVA search |
-| `goal/ stop when you see a guy holding a phone` | YOLO + LLaVA compound verification |
+| `goal/ find a laptop` | YOLO + Qwen-VL search |
+| `goal/ stop when you see a guy holding a phone` | YOLO + Qwen-VL compound verification |
| `find a person` | Auto-detected as goal (no prefix needed) |
| `look for a bottle` | Auto-detected as goal |

@@ -106,7 +139,7 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
| `patrol` | Autonomous patrol (prompts for duration) |
| `patrol: door → desk → exit` | Named waypoint patrol |

-### Image Search
+### Image Search (requires `subsystems.imgsearch: true`)
| Command | Action |
|---------|--------|
| `search/ /path/to/photo.jpg` | Find target from reference image |
@@ -122,11 +155,20 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
| `last session` | Previous session summary |
| `session summary` | Current session stats |

+### Autonomous Mode
+| Command | Action |
+|---------|--------|
+| `auto on` | Start autonomous exploration |
+| `auto off` | Stop |
+| `auto status` | Current step / observations |
+| `auto save` | Snapshot observations to disk |
+
### System
| Command | Action |
|---------|--------|
| `help` | Command reference |
| `example` | Usage
examples | +| `lidar` / `lidar status` | SLAM engine pose + health | | `q` / `quit` | Shutdown | ### Client-Only Commands (CLI) @@ -139,35 +181,43 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765` --- +## Subsystem flags (`Config/config_Brain.json`) + +Control what initializes at boot. Defaults: + +```jsonc +"subsystems": { + "lidar": true, + "voice": true, + "imgsearch": false, + "autonomous": true +} +``` + +Set any to `false` to skip that subsystem's init. Boot time drops roughly: +- `voice: false` → ~2 s faster (no Whisper model load) +- `lidar: false` → ~1 s faster (no SLAM subprocess spawn) +- `imgsearch: false` → already the default; re-enable only when you need `search/ …` +- `autonomous: false` → minor, but removes the AutonomousMode init + +--- + ## Network Configuration | Interface | IP | Use | |-----------|-----|------| -| `eth0` | 192.168.123.164 | Robot internal network (Jetson - G1 - LiDAR) | -| `wlan0` | 10.255.254.86 | Office WiFi (Jetson - Workstation) | +| `eth0` | 192.168.123.164 | Robot internal network (Jetson ↔ G1 ↔ LiDAR) | +| `wlan0` | 10.255.254.86 | Office WiFi (Jetson ↔ Workstation) | | Service | Port | Protocol | |---------|------|----------| | Marcus WebSocket | 8765 | ws:// | -| ZMQ Velocity | 5556 | tcp:// (PUB/SUB) | -| Ollama API | 11434 | HTTP | -| LiDAR | 192.168.123.120 | Livox Mid360 | +| ZMQ velocity (→ Holosoma) | 5556 | tcp:// (PUB/SUB) | +| Ollama API | 11434 | HTTP (localhost only) | +| G1 audio multicast (mic) | 5555 | UDP multicast 239.168.123.161 | +| Livox Mid-360 (LiDAR) | 192.168.123.120 | UDP (Livox SDK) | -All configurable in `Config/config_Network.json`. - ---- - -## Subsystem Status - -On startup, the server/brain shows: -``` -YOLO : active (19 tracked classes, CPU, yolov8m.pt) -Odometry : active (dead reckoning, +/-10cm) -Memory : active (session_016_2026-04-06) -Camera : 424x240@15 (RealSense D435I) -LiDAR : ALIVE (Livox Mid360 at 192.168.123.120) -Arms : pending (GR00T N1.5 not yet integrated) -``` +Most values configurable in `Config/config_Network.json` and `config_Voice.json::mic_udp`. --- @@ -175,13 +225,15 @@ Arms : pending (GR00T N1.5 not yet integrated) | Issue | Cause | Fix | |-------|-------|-----| -| `ModuleNotFoundError: No module named 'Server'` | Wrong directory | `cd ~/Marcus` then run | -| Robot doesn't move | Holosoma not running | Start Holosoma first (Terminal 1) | -| Robot doesn't move | ZMQ port conflict | Only run one of Server or Brain, not both | -| `Camera: {e} reconnecting` | USB bandwidth | Reduce to `low` profile | -| LLaVA slow (>10s) | GPU VRAM full | Kill other GPU processes, or use `qwen2.5vl:3b` | -| `YOLO not available` | ultralytics not installed | `pip install ultralytics` | -| Client can't connect | Wrong IP or server not running | Check `status` command, verify IP | +| Banner shows `SANAD AI BRAIN — READY` but nothing moves | Holosoma not running | Start Holosoma (Terminal 1) first | +| `RuntimeError: CUDA not available` on boot | Wrong torch build on Jetson | See `Doc/environment.md` section 9.2 — reinstall the NVIDIA Jetson torch wheel | +| `llama runner process has terminated: %!w()` | Ollama compute graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. 
Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` | +| Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only | +| `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10–15 s on first Qwen load; subsequent commands are fast | +| Wake word never fires | Whisper hearing something else | Check `logs/voice.log` — if it transcribes as "sunnat"/"sannat", add your variant to `config_Voice.json::stt.wake_words_en` | +| Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" | +| `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If persists, `ping 192.168.123.120` | +| Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up | --- @@ -191,6 +243,7 @@ Arms : pending (GR00T N1.5 not yet integrated) |------|------| | Brain code | `~/Marcus/Brain/` | | Server | `~/Marcus/Server/marcus_server.py` | +| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,marcus_voice}.py` | | Config | `~/Marcus/Config/` | | Prompts | `~/Marcus/Config/marcus_prompts.yaml` | | YOLO model | `~/Marcus/Models/yolov8m.pt` | @@ -199,3 +252,5 @@ Arms : pending (GR00T N1.5 not yet integrated) | Logs | `~/Marcus/logs/` | See `Doc/architecture.md` for full project structure and file-by-file documentation. +See `Doc/environment.md` for the verified Jetson software stack. +See `Doc/pipeline.md` for the end-to-end data flow. diff --git a/Doc/environment.md b/Doc/environment.md index 248b3dc..5a43f48 100644 --- a/Doc/environment.md +++ b/Doc/environment.md @@ -1,10 +1,11 @@ # Marcus — Environment & Version Reference **Project**: Marcus | YS Lootah Technology +**Robot persona**: Sanad (wake word + self-intro; codebase stays under `Marcus/`) **Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX 16 GB **Deployment host**: `unitree@192.168.123.164` (hostname `ubuntu`) **Conda env**: `marcus` -**Captured**: 2026-04-12 +**Captured**: 2026-04-12 (updated 2026-04-21) This document is the canonical record of the verified GPU-accelerated software stack running on the Jetson Orin NX. It covers system software, Python environment, Marcus runtime dependencies, installation recipe, verification commands, and known quirks. Pair it with `architecture.md` (what the code does) and `controlling.md` (how to drive it). @@ -136,29 +137,23 @@ Captured from `importlib` on 2026-04-12, `marcus` env on the Jetson. ## 8. 
Marcus project modules — import status -All 16 project modules import cleanly from the `marcus` env at `/home/unitree/Marcus`: +All 25 project modules import cleanly from the `marcus` env at `/home/unitree/Marcus`: ``` -OK Core.config_loader -OK Core.env_loader -OK Vision.marcus_yolo -OK Vision.marcus_imgsearch -OK API.llava_api -OK API.yolo_api -OK API.camera_api -OK API.zmq_api -OK API.imgsearch_api -OK API.odometry_api -OK API.memory_api -OK API.arm_api -OK Navigation.goal_nav -OK Navigation.patrol -OK Navigation.marcus_odometry -OK Brain.marcus_brain -OK Brain.marcus_memory +OK Core.config_loader Core.env_loader +OK Core.log_backend Core.logger +OK Voice.builtin_mic Voice.builtin_tts Voice.marcus_voice +OK Vision.marcus_yolo Vision.marcus_imgsearch +OK API.llava_api API.yolo_api API.camera_api +OK API.zmq_api API.imgsearch_api API.odometry_api +OK API.memory_api API.arm_api API.audio_api +OK Navigation.goal_nav Navigation.patrol Navigation.marcus_odometry +OK Brain.marcus_brain Brain.marcus_memory Brain.command_parser OK Autonomous.marcus_autonomous ``` +Notable removals: `Voice/marcus_gemini_voice.py` deleted on 2026-04-21. `Core/Logger.py` renamed to `Core/log_backend.py`. + --- ## 9. Installation recipe (reproducing this environment) @@ -378,3 +373,7 @@ Config file (`Config/config_Vision.json`): | 2026-04-12 | Initial environment.md — full stack captured, GPU bring-up verified end to end. Steady-state YOLOv8m FPS on Orin NX measured at 21.9. Ollama Qwen2.5-VL verified at 100% GPU. | | 2026-04-12 | `Vision/marcus_yolo.py` rewired to load `config_Vision.json`, added `_resolve_device()` with hard-fail on missing CUDA (GPU-only policy). `API/yolo_api.py` updated to propagate `RuntimeError`. `Config/config_Vision.json` set `yolo_device=cuda`, `yolo_half=true`. | | 2026-04-12 | Installed NVIDIA Jetson torch `2.1.0a0+41361538.nv23.06` (replacing CPU-only PyPI `2.4.1`) + built torchvision `0.16.1` from source against it. Verified `nms device = cuda:0`. | +| 2026-04-12 | Fixed llama.cpp compute-graph OOM on Jetson: added `num_batch=128` + `num_ctx=2048` caps in `Config/config_Brain.json`, propagated through `API/llava_api.py` and `Vision/marcus_imgsearch.py`. Qwen2.5-VL compute graph drops from ~7.5 GiB to ~1.8 GiB. | +| 2026-04-21 | **Restructure**: moved ZMQ bind out of `API/zmq_api.py` import time into `init_zmq()`; fixes LiDAR SLAM worker spawn crash. Added loud GPU-requirement banner in `API/yolo_api.py`. Dropped `num_predict_main` 200→120. Made inner-loop sleeps in goal_nav/autonomous/imgsearch conditional. Renamed `Core/Logger.py` → `Core/log_backend.py` (case-collision fix). Updated `Doc/MARCUS_API.md` to current state. | +| 2026-04-21 | **Voice restructure**: added `Voice/builtin_mic.py` (G1 array mic via UDP multicast `239.168.123.161:5555`) and `Voice/builtin_tts.py` (thin `AudioClient.TtsMaker` wrapper). Rewired `Voice/marcus_voice.py` to use BuiltinMic. Refactored `API/audio_api.py::speak()` to use BuiltinTTS — removed ~110 lines of edge-tts + pydub + Piper plumbing. Deleted `Voice/marcus_gemini_voice.py`. Added `subsystems.{lidar,voice,imgsearch,autonomous}` gate in `config_Brain.json::init_brain()`. | +| 2026-04-21 | **Persona swap**: robot identifies as Sanad. Wake words `["sanad","sannad","sanat","sunnat"]`, `speaker.app_name="sanad"`, all Qwen prompts say "You are Sanad", banner reads `SANAD AI BRAIN — READY`, hardcoded self-intro says "I am Sanad". Project directory, class names, filenames, and `PROJECT_NAME=Marcus` env var unchanged. 
| diff --git a/Doc/note.txt b/Doc/note.txt index 6d7bf02..a1c3f6b 100644 --- a/Doc/note.txt +++ b/Doc/note.txt @@ -38,15 +38,12 @@ rm ~/Robotics_workspace/yslootahtech/Project/Marcus_fine_tune/marcus-gguf/marcus - -https://ingrid789.github.io/SkillMimic/ -https://github.com/wyhuai/SkillMimic - -https://vla-survey.github.io/ - - - - +https://github.com/AnjieCheng/NaVILA +https://rchalyang.github.io/EgoVLA/ +https://github.com/RchalYang/EgoVLA_Release +https://github.com/openvla/openvla +https://github.com/unitreerobotics/unifolm-vla +https://github.com/OpenDriveLab/WholebodyVLA diff --git a/Doc/pipeline.md b/Doc/pipeline.md new file mode 100644 index 0000000..338d4bc --- /dev/null +++ b/Doc/pipeline.md @@ -0,0 +1,187 @@ +# Marcus — End-to-End Pipeline + +**Robot persona:** Sanad (wake word + self-intro) +**Updated:** 2026-04-21 + +One map of every data path from sensor to motor, voice to speech. Cross-reference with `architecture.md` (what each file is) and `MARCUS_API.md` (function signatures). + +--- + +## Boot sequence + +`Brain/marcus_brain.py::init_brain()` — called once from `run_marcus.py` or `marcus_server.py`. + +``` +run_marcus.py + │ + ▼ +init_brain() + │ + ├─ init_zmq() PUB bind tcp://127.0.0.1:5556 → Holosoma + ├─ start_camera() RealSense 424×240@15fps → shared _raw_frame + ├─ init_yolo(raw_frame, raw_lock) YOLOv8m CUDA FP16, 19 classes — background thread + ├─ init_odometry() ROS2 /dog_odom → dead reckoning fallback + ├─ init_memory() loads Data/Brain/Sessions/session_NNN/ + │ + ├─ if subsystems.lidar: init_lidar() multiprocessing spawn SLAM_worker + ├─ if subsystems.imgsearch: init_imgsearch() (off by default) + ├─ if subsystems.autonomous: AutonomousMode() patrol state machine + │ + ├─ send_cmd("start") + 0.5s + send_cmd("walk") + 0.5s Holosoma handshake + │ + ├─ if subsystems.voice: _init_voice() ▼ voice pipeline below + └─ _warmup_llava() first Qwen2.5-VL inference + "SANAD AI BRAIN — READY" +``` + +Subsystem flags live in `config_Brain.json::subsystems`. 
Current defaults: + +```json +"subsystems": { "lidar": true, "voice": true, "imgsearch": false, "autonomous": true } +``` + +--- + +## Voice pipeline (when `subsystems.voice = true`) + +``` +G1 body mic (array) + └─ UDP multicast 239.168.123.161:5555 ── int16 mono 16 kHz PCM + ▼ +Voice/builtin_mic.py::BuiltinMic + ring buffer (64 KB) + read_chunk(n) + ▼ +Voice/marcus_voice.py::VoiceModule (IDLE → WAKE_HEARD → PROCESSING → SPEAKING) + ├─ IDLE : 2-s chunks → Whisper tiny → wake-word match ("sanad"/"sannad"/…) + ├─ WAKE_HEARD : audio_api.speak("Listening") → G1 body speaker + ├─ PROCESSING : record-until-silence → Whisper small → transcribed text + └─ on_command(text, "en") + ▼ +Brain/marcus_brain.py::process_command(text) + ├─ regex fast-path → Brain/command_parser.py::try_local_command() + │ places · odometry walk/turn · patrol · session recall · goal_nav · auto on/off + └─ else → _handle_llava(text) + ├─ get_frame() (10×50 ms poll, no 1 s stall) + ├─ API/llava_api.py::ask(text, img) + │ ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120) + │ → parse_json() → {actions, arm, speak, abort} + └─ Brain/executor.py::execute(d) + ├─ actions → API/zmq_api.py::send_vel(vx, vy, vyaw) → Holosoma + ├─ arm → API/arm_api.py (stub for now) + └─ abort → gradual_stop() + ▼ +result["speak"] → audio_api.speak(reply) + ▼ +API/audio_api.py::speak(text, lang="en") + ├─ mute mic (flush BuiltinMic buffer) + ├─ Voice/builtin_tts.py::BuiltinTTS.speak(text) + │ client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only + │ time.sleep(len(text) * 0.08) + └─ unmute mic → back to IDLE +``` + +--- + +## Terminal / WebSocket command pipeline (same brain, skips voice) + +``` +run_marcus.py stdin OR Server/marcus_server.py WebSocket + ▼ +Brain/marcus_brain.py::process_command(text) + ▼ (same parser → LLaVA → executor → ZMQ as above) + ▼ +result dict → stdout OR WebSocket reply frame +``` + +--- + +## Vision pipeline (continuous, consumed by brain on demand) + +``` +RealSense D435 (USB) + └─ 424×240 BGR 15 fps + → API/camera_api.py — shared _raw_frame (thread-safe) + │ │ + │ └─ get_frame() → JPEG base64 on demand + ▼ + Vision/marcus_yolo.py (daemon thread) + YOLOv8m @ cuda:0 FP16 imgsz=320 + → _latest_detections (thread-safe list) + yolo_sees / yolo_closest / yolo_summary / yolo_fps + ▼ + Navigation/goal_nav.py (fast YOLO check → Qwen-VL fallback) + Autonomous/marcus_autonomous.py (patrol scan every N steps) + Brain/marcus_brain.py (status / alerts) +``` + +--- + +## Movement pipeline + +``` +Brain/executor.py OR Brain/command_parser.py OR Navigation/* + │ uses MOVE_MAP from config_Navigation.json + ▼ +API/zmq_api.py::send_vel(vx, vy, vyaw) JSON over ZMQ PUB (port 5556) + ▼ +Holosoma RL policy (separate process, hsinference env) + ▼ +G1 low-level joint commands over DDS/eth0 + ▼ +29-DOF body motion +``` + +--- + +## LiDAR pipeline (when `subsystems.lidar = true`) + +``` +Livox Mid-360 (192.168.123.120, UDP) + ▼ +Lidar/SLAM_worker.py (multiprocessing.spawn subprocess — CUDA-safe spawn) + ├─ SLAM_engine, SLAM_Filter, SLAM_LoopClosure, SLAM_Submap, SLAM_NavRuntime + ├─ publishes pose + obstacle flags back to parent via Queue + └─ writes occupancy grids to Data/Navigation/Maps/ + ▼ +API/lidar_api.py (reads the queues, exposes:) + ├─ obstacle_ahead() → bool + ├─ get_lidar_status() → dict (pose, loc_state, frame age, FPS, ICP ms) + └─ LIDAR_AVAILABLE + ▼ +Navigation/goal_nav.py rotation thread — pauses motion on obstacle_ahead() +Brain/command_parser.py — responds to "lidar status" queries +``` + 
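+
+The parent/worker handoff in one hedged sketch. `run_slam_step()` and the queue payload shape are illustrative assumptions, not the real `Lidar/SLAM_worker.py` internals:
+
+```python
+import multiprocessing as mp
+
+def slam_worker(out_q) -> None:
+    """Worker process: push pose + obstacle flags back to the parent via the queue."""
+    while True:
+        pose, obstacle = run_slam_step()  # hypothetical engine call (assumption)
+        if not out_q.full():              # drop frames rather than block the worker
+            out_q.put({"pose": pose, "obstacle": obstacle})
+
+ctx = mp.get_context("spawn")  # CUDA-safe start method, as noted above
+queue = ctx.Queue(maxsize=4)
+proc = ctx.Process(target=slam_worker, args=(queue,), daemon=True)
+proc.start()  # lidar_api reads `queue` on the parent side
+```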
+--- + +## Knobs that control each stage + +| Knob | Location | Effect | +|---|---|---| +| `subsystems.lidar` | config_Brain.json | SLAM subprocess on/off | +| `subsystems.voice` | config_Brain.json | BuiltinMic + Whisper + TtsMaker loop on/off | +| `subsystems.imgsearch` | config_Brain.json | image-guided search init on/off | +| `subsystems.autonomous` | config_Brain.json | auto-patrol state machine init on/off | +| `num_batch`, `num_ctx` | config_Brain.json | llama.cpp compute-graph size (128 / 2048 ≈ 1.8 GiB graph — **do not raise** on 16 GB Jetson) | +| `num_predict_main` | config_Brain.json | 120 tokens max for the main JSON reply | +| `yolo_device`, `yolo_half` | config_Vision.json | `cuda` / FP16 (hard-required; CPU not allowed) | +| `mic.backend` | config_Voice.json | `builtin_udp` (G1 array) or `pactl_parec` (Hollyland fallback) | +| `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast | +| `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) | +| `stt.wake_words_en` | config_Voice.json | Whisper matcher (`sanad` + variants) | + +--- + +## Per-command latency (estimated, post-fixes) + +| Step | Typical | Notes | +|---|---|---| +| Wake-word detect | 200–500 ms | Whisper tiny on 2 s chunk | +| Record until silence | 1–8 s | depends on user speech | +| Whisper small STT | 500–1500 ms | once per command | +| Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall | +| Ollama Qwen2.5-VL | 800–1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` | +| Executor + ZMQ send | <10 ms | fire-and-forget PUB | +| TtsMaker playback | ~len(text) × 80 ms | synthesizes + plays on robot | + +**Total wake → answer-playback:** ~**2.5–4 s** for a short vision question like "what do you see" (vs. 5–8 s with the pre-restructure edge-tts/Gemini overhead). diff --git a/Navigation/goal_nav.py b/Navigation/goal_nav.py index 8341880..0952a90 100644 --- a/Navigation/goal_nav.py +++ b/Navigation/goal_nav.py @@ -123,26 +123,36 @@ def navigate_to_goal(goal: str, max_steps: int = 0): reached = False try: for step in range(1, max_steps + 1): - time.sleep(SCAN_INTERVAL) + # Track whether real work happened this iteration. If it did, + # the work itself already ate wall time — don't pay an extra + # SCAN_INTERVAL nap on top. 
+ did_work = False # --- YOLO fast check --- if yolo_target and yolo_sees(yolo_target): img_b64 = get_frame() + did_work = True if condition: if not _verify_condition(yolo_target, condition, img_b64): print(f" [GoalNav] YOLO sees {yolo_target} but condition " f"'{condition}' not met — continuing") - continue - - print(f" [GoalNav] YOLO confirmed '{yolo_target}' at step {step}") - log_detection(yolo_target, position="goal", distance="close") - reached = True - break + # fall through to the sleep-skip path + else: + print(f" [GoalNav] YOLO confirmed '{yolo_target}' at step {step}") + log_detection(yolo_target, position="goal", distance="close") + reached = True + break + else: + print(f" [GoalNav] YOLO confirmed '{yolo_target}' at step {step}") + log_detection(yolo_target, position="goal", distance="close") + reached = True + break # --- LLaVA fallback (less frequent — every few steps) --- if step >= MIN_STEPS and step % MIN_STEPS == 0: img_b64 = get_frame() if img_b64: + did_work = True d = ask_goal(goal, img_b64) if d.get("reached"): print(f" [GoalNav] LLaVA says goal reached at step {step}") @@ -152,6 +162,11 @@ def navigate_to_goal(goal: str, max_steps: int = 0): if speak: print(f" [GoalNav] LLaVA: {speak}") + # Only pay the scan interval when nothing happened this step. + # If YOLO hit or LLaVA fired, they already took 50–1000 ms. + if not did_work: + time.sleep(SCAN_INTERVAL) + finally: rotating[0] = False rot_thread.join(timeout=1.0) diff --git a/Vision/marcus_imgsearch.py b/Vision/marcus_imgsearch.py index 9d51fa3..9b03908 100644 --- a/Vision/marcus_imgsearch.py +++ b/Vision/marcus_imgsearch.py @@ -59,7 +59,9 @@ except ImportError: # ══════════════════════════════════════════════════════════════════════════════ DEFAULT_MAX_STEPS = 60 # max rotation steps before giving up -STEP_DELAY = 0.4 # seconds between YOLO checks +STEP_DELAY = 0.15 # min gap between YOLO checks (was 0.4 — reduced + # because the rotation thread paces motion already + # and each LLaVA call is 600-1500 ms of real work) ROTATE_SPEED = 0.25 # rad/s rotation speed during search MIN_STEPS_WARMUP = 3 # skip first N steps (stale frame) MATCH_CONFIDENCE_THR = 0.6 # LLaVA confidence threshold (not used directly, diff --git a/Voice/builtin_mic.py b/Voice/builtin_mic.py new file mode 100644 index 0000000..a4a1c52 --- /dev/null +++ b/Voice/builtin_mic.py @@ -0,0 +1,202 @@ +""" +builtin_mic.py — G1 built-in microphone (UDP multicast capture) +================================================================ +The G1 humanoid's on-board microphone is published by the Unitree firmware +as an RTP-like UDP multicast stream on 239.168.123.161:5555, carrying +16 kHz mono int16 PCM. Any host on the robot's 192.168.123.0/24 network +can join the group and read the audio — no extra SDK call required. + +This module intentionally has no dependency on pyaudio, pulseaudio, or the +unitree_sdk2py package. Joining the multicast group is all that's needed. + +Usage: + from Voice.builtin_mic import BuiltinMic + mic = BuiltinMic() + mic.start() + try: + chunk = mic.read_chunk(1024) # 512 samples, 32 ms at 16 kHz + ... + finally: + mic.stop() + +Ported from Project/Sanad/voice/audio_io.py (Sanad's production implementation). 
+""" + +from __future__ import annotations + +import socket +import struct +import subprocess +import threading +import time +from typing import Optional + + +DEFAULT_GROUP = "239.168.123.161" +DEFAULT_PORT = 5555 +DEFAULT_BUF_MAX = 64_000 # ~2 s of 16 kHz mono int16 +DEFAULT_READ_TIMEOUT = 0.04 # 40 ms budget per read_chunk call +SAMPLE_RATE = 16_000 # hardware rate — do not change + + +def _find_g1_local_ip() -> str: + """ + Return the host IPv4 on the G1's internal 192.168.123.0/24 network. + Required by IP_ADD_MEMBERSHIP so the kernel knows which NIC to join on. + """ + out = subprocess.run( + ["ip", "-4", "-o", "addr"], capture_output=True, text=True, + ).stdout + for line in out.splitlines(): + for tok in line.split(): + if tok.startswith("192.168.123."): + return tok.split("/")[0] + raise RuntimeError( + "BuiltinMic: no interface on 192.168.123.0/24 — " + "host is not on the G1's internal network" + ) + + +class BuiltinMic: + """ + G1 on-board microphone over UDP multicast. + + Thread-safe: a background daemon thread receives datagrams into an + internal ring buffer; `read_chunk(n)` pulls the next `n` bytes or + blocks up to `read_timeout` before returning zeros. + """ + + sample_rate = SAMPLE_RATE + + def __init__( + self, + group: str = DEFAULT_GROUP, + port: int = DEFAULT_PORT, + buf_max: int = DEFAULT_BUF_MAX, + read_timeout: float = DEFAULT_READ_TIMEOUT, + ): + self._group = group + self._port = port + self._buf_max = buf_max + self._read_timeout = read_timeout + self._sock: Optional[socket.socket] = None + self._buf = bytearray() + self._lock = threading.Lock() + self._running = False + self._thread: Optional[threading.Thread] = None + + def start(self) -> None: + if self._running: + return + local_ip = _find_g1_local_ip() + self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self._sock.bind(("", self._port)) + mreq = struct.pack( + "4s4s", + socket.inet_aton(self._group), + socket.inet_aton(local_ip), + ) + self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq) + self._sock.settimeout(1.0) + self._running = True + self._thread = threading.Thread( + target=self._recv_loop, daemon=True, name="builtin_mic_rx", + ) + self._thread.start() + print(f" [BuiltinMic] joined {self._group}:{self._port} on {local_ip}") + + def _recv_loop(self) -> None: + while self._running: + try: + data, _ = self._sock.recvfrom(4096) + with self._lock: + self._buf.extend(data) + # ring-buffer: drop oldest when we'd exceed buf_max + if len(self._buf) > self._buf_max: + del self._buf[: len(self._buf) - self._buf_max] + except socket.timeout: + continue + except Exception: + if self._running: + time.sleep(0.01) + + def read_chunk(self, num_bytes: int) -> bytes: + """ + Return exactly `num_bytes` of 16 kHz mono int16 PCM. + + Waits up to `read_timeout` for that many bytes to be available. + If the buffer is still short after the timeout, returns whatever + is available padded with silence. Never blocks forever. 
+ """ + deadline = time.time() + self._read_timeout + while time.time() < deadline: + with self._lock: + if len(self._buf) >= num_bytes: + chunk = bytes(self._buf[:num_bytes]) + del self._buf[:num_bytes] + return chunk + time.sleep(0.003) + with self._lock: + avail = len(self._buf) + if avail > 0: + chunk = bytes(self._buf[:avail]) + del self._buf[:avail] + return chunk + b"\x00" * (num_bytes - avail) + return b"\x00" * num_bytes + + def read_seconds(self, seconds: float) -> bytes: + """ + Convenience: capture `seconds` of audio and return as bytes. + Blocks for the full duration (not a real-time producer). + """ + num_bytes = int(seconds * self.sample_rate * 2) # 2 bytes/sample (int16) + out = bytearray() + chunk_bytes = 1024 + while len(out) < num_bytes: + out.extend(self.read_chunk(min(chunk_bytes, num_bytes - len(out)))) + return bytes(out) + + def flush(self) -> None: + """Drop all buffered audio (e.g. after the robot spoke).""" + with self._lock: + self._buf.clear() + + def stop(self) -> None: + self._running = False + if self._sock is not None: + try: + self._sock.close() + except Exception: + pass + self._sock = None + if self._thread is not None: + self._thread.join(timeout=1.5) + self._thread = None + + +# ──────────────────────────────────────────────────────────────── +# Standalone test — capture 3 s and print energy stats +# ──────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + import array + + print("BuiltinMic standalone test — capturing 3 s from G1...") + mic = BuiltinMic() + mic.start() + time.sleep(0.3) # let the receiver thread warm up + raw = mic.read_seconds(3.0) + mic.stop() + + samples = array.array("h", raw) + if not samples: + print(" FAIL — got zero samples") + else: + mn = min(samples); mx = max(samples) + mean_abs = sum(abs(s) for s in samples) / len(samples) + print(f" samples={len(samples)} min={mn} max={mx} mean|s|={mean_abs:.0f}") + if mean_abs > 30: + print(" OK — mic is capturing audio") + else: + print(" WARN — signal very low, check G1 audio service is running") diff --git a/Voice/builtin_tts.py b/Voice/builtin_tts.py new file mode 100644 index 0000000..32ab3d5 --- /dev/null +++ b/Voice/builtin_tts.py @@ -0,0 +1,88 @@ +""" +builtin_tts.py — Unitree G1 built-in TTS (English only) +======================================================== +Thin wrapper around AudioClient.TtsMaker(text, speaker_id). The G1's on-board +TTS engine synthesizes and plays directly through the body speaker — no +internet, no MP3/WAV roundtrip, no audio SDK plumbing on our side. + +Supported languages (firmware-side): + English — works (Marcus uses this) + Chinese — works (unused) + Arabic — silently falls back to Chinese (unusable — we refuse these) + +Signature: + client.TtsMaker(text: str, speaker_id: int) -> int # 0 = success + speaker_id ∈ {0, 1, 2} — different voice timbres + +Usage: + from Voice.builtin_tts import BuiltinTTS + tts = BuiltinTTS(audio_client) + tts.speak("Hello, I am Sanad", speaker_id=0) +""" + +from __future__ import annotations + +import logging +import time +from typing import Optional + +log = logging.getLogger("builtin_tts") + + +class BuiltinTTS: + """Synchronous English-only TTS via the G1's on-board engine.""" + + # Rough playback duration per character — enough margin that `speak()` + # returns after audio has actually finished on the robot. 
+ SECONDS_PER_CHAR = 0.08 + MIN_SECONDS = 1.5 + + def __init__(self, audio_client, default_speaker_id: int = 0): + """ + Args: + audio_client : initialized unitree_sdk2py AudioClient + default_speaker_id : 0, 1, or 2 (default voice timbre) + """ + self._client = audio_client + self._default_speaker = default_speaker_id + + def speak( + self, + text: str, + speaker_id: Optional[int] = None, + block: bool = True, + ) -> int: + """ + Play `text` on the G1 speaker via TtsMaker. + + English-only by policy. Non-ASCII (Arabic) input is rejected rather + than silently played back as Chinese. Returns the TtsMaker status + code (0 = success) or -1 if input was rejected. + """ + if not text or not text.strip(): + return -1 + + # Reject non-English. TtsMaker "falls back" by playing Arabic text + # as Chinese phonemes — intelligible to nobody — so we refuse it + # rather than surprise the operator. + if any(ord(c) > 127 for c in text): + log.warning("builtin_tts refusing non-ASCII text: %r", text[:60]) + return -1 + + sid = self._default_speaker if speaker_id is None else speaker_id + log.info("[TtsMaker sid=%d] %s", sid, text[:80]) + + try: + code = self._client.TtsMaker(text, sid) + except Exception as e: + log.error("TtsMaker call failed: %s", e) + return -1 + + if block: + # Estimate how long the G1 is going to take to finish speaking. + # TtsMaker is fire-and-forget — we need to wait so the mic loop + # knows when to unmute. + duration = max(self.MIN_SECONDS, len(text) * self.SECONDS_PER_CHAR) + time.sleep(duration) + + return code diff --git a/Voice/marcus_gemini_voice.py b/Voice/marcus_gemini_voice.py deleted file mode 100644 index f02495a..0000000 --- a/Voice/marcus_gemini_voice.py +++ /dev/null @@ -1,608 +0,0 @@ -#!/usr/bin/env python3 -""" -Voice/marcus_gemini_voice.py — Marcus Gemini Live Voice Module v2 -================================================================== -Real-time bidirectional voice conversation using Gemini 2.5 Flash native audio. -Uses G1 built-in speaker + Hollyland wireless mic. 
- -Based on SanadVoice/gemini_interact architecture: -- PyAudio for mic (not parec) -- Echo suppression (silence when speaking) -- Gemini VAD (automatic activity detection) -- thinkingBudget=0 (no thinking text) -- ASR buffering for full sentences -- Vision routed to brain's Qwen camera - -Usage: - from Voice.marcus_gemini_voice import GeminiVoiceModule - voice = GeminiVoiceModule(audio_api, on_transcript=callback) - voice.start() -""" - -import array -import asyncio -import base64 -import json -import logging -import os -import subprocess -import threading -import time -import numpy as np - -from dotenv import load_dotenv -load_dotenv() - -BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree") -PROJECT_NAME = "Marcus" -PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME) - -LOG_DIR = os.path.join(PROJECT_ROOT, "logs") -os.makedirs(LOG_DIR, exist_ok=True) - -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(name)s] %(levelname)s: %(message)s", - handlers=[ - logging.FileHandler(os.path.join(LOG_DIR, "voice.log")), - logging.StreamHandler(), - ], -) -log = logging.getLogger("gemini_voice") - - -def load_config(name: str) -> dict: - path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json") - with open(path, "r") as f: - return json.load(f) - - -# ─── CONFIGURATION ──────────────────────────────────────── - -API_KEY = "AIzaSyDt9Xi83MDZuuPpfwfHyMD92X7ZKdGkqf8" -MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025" -URI = ( - "wss://generativelanguage.googleapis.com/ws/" - "google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent" - f"?key={API_KEY}" -) - -VOICE_NAME = "Charon" -SEND_RATE = 16000 -RECEIVE_RATE = 24000 -CHUNK_SIZE = 512 -CHANNELS = 1 - - -def load_system_prompt(): - paths = [ - os.path.join(PROJECT_ROOT, "Config", "marcus_voice_prompt.txt"), - ] - for p in paths: - if os.path.exists(p): - with open(p, "r", encoding="utf-8-sig") as f: - return f.read().strip() - return ( - "You are Marcus, a humanoid robot by YS Lootah Technology, Dubai. " - "Speak Arabic (UAE dialect) and English. Be concise — 1-2 sentences max." 
- ) - - -# ─── AUDIO HELPERS ──────────────────────────────────────── - -def audio_energy(pcm: bytes) -> int: - try: - samples = array.array("h", pcm) - if not samples: - return 0 - return sum(abs(s) for s in samples) // len(samples) - except Exception: - return 0 - - -SILENCE_PCM = b'\x00' * (CHUNK_SIZE * 2) - - -# ─── GEMINI VOICE MODULE ───────────────────────────────── - -class GeminiVoiceModule: - """Real-time voice conversation via Gemini, using G1 speaker + Hollyland mic.""" - - def __init__(self, audio_api, on_transcript=None): - self._audio = audio_api - self._on_transcript = on_transcript - self._config = load_config("Voice") - self._mic_source = getattr(audio_api, '_mic_source', - self._config["mic"].get("source_index", "0")) - - # State - self.speaking = False - self.interrupted = False - self._running = False - self._thread = None - self._audio_queue = None # Created in async context - - # Tuning - self.MIN_THRESHOLD = 3000 - self.barge_in_threshold = self.MIN_THRESHOLD - self.REQUIRED_LOUD_CHUNKS = 10 - self.PREBUFFER_CHUNKS = 2 - self.PLAYBACK_TIMEOUT = 0.25 - self.BARGE_IN_COOLDOWN = 0.7 - self.AI_SPEAK_GRACE = 0.20 - self.ECHO_GUARD_SEC = 0.8 - self.SPEAKING_ENERGY_GATE = 0.85 - self.SEND_SILENCE_WHEN_SPEAKING = True - - # Timing - self._ai_speaking_since = 0.0 - self._last_ai_audio_time = 0.0 - self._barge_in_block_until = 0.0 - self._ignore_input_until = 0.0 - - # ASR buffer - self._asr_buf = "" - self._asr_last_time = 0.0 - self.ASR_WINDOW_SEC = 2.0 - - # Find Hollyland mic PyAudio device index - self._mic_device_idx = self._find_mic_device() - - log.info("GeminiVoiceModule v2 initialized") - - # ─── MIC DEVICE DETECTION ───────────────────────────── - - def _find_mic_device(self) -> int: - """Find Hollyland wireless mic in PyAudio devices. 
Returns device index.""" - import pyaudio - import ctypes - ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p) - def _alsa_error_handler(filename, line, function, err, fmt): - pass # suppress - c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler) - try: - asound = ctypes.cdll.LoadLibrary("libasound.so.2") - asound.snd_lib_error_set_handler(c_error_handler) - except: pass # ALSA_suppress - pa = pyaudio.PyAudio() - try: - # First: set PulseAudio default source to Hollyland - subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True) - subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True) - - # Search for wireless mic by name - for i in range(pa.get_device_count()): - info = pa.get_device_info_by_index(i) - name = info.get("name", "").lower() - if info["maxInputChannels"] > 0 and ("wireless" in name or "hollyland" in name): - log.info("Mic found: [%d] %s (%dHz)", i, info["name"], int(info["defaultSampleRate"])) - return i - - # Fallback to 'default' or 'pulse' device - for i in range(pa.get_device_count()): - info = pa.get_device_info_by_index(i) - if info["maxInputChannels"] > 0 and info.get("name", "") in ("default", "pulse"): - log.info("Mic fallback: [%d] %s", i, info["name"]) - return i - - log.warning("No mic found, using device 0") - return 0 - finally: - pa.terminate() - - # ─── MIC CALIBRATION ────────────────────────────────── - - def _calibrate_mic(self): - """Calibrate barge-in threshold from ambient noise.""" - import pyaudio - import ctypes - ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p) - def _alsa_error_handler(filename, line, function, err, fmt): - pass # suppress - c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler) - try: - asound = ctypes.cdll.LoadLibrary("libasound.so.2") - asound.snd_lib_error_set_handler(c_error_handler) - except: pass # ALSA_suppress - pa = pyaudio.PyAudio() - mic_info = pa.get_device_info_by_index(self._mic_device_idx) - mic_rate = int(mic_info["defaultSampleRate"]) - mic_channels = 1 - try: - stream = pa.open(format=pyaudio.paInt16, channels=mic_channels, - rate=mic_rate, input=True, - input_device_index=self._mic_device_idx, - frames_per_buffer=CHUNK_SIZE) - values = [] - for _ in range(40): - data = stream.read(CHUNK_SIZE, exception_on_overflow=False) - values.append(audio_energy(data)) - stream.stop_stream() - stream.close() - avg_noise = sum(values) / len(values) if values else 0 - self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0) - log.info("Mic calibrated: noise=%.0f, threshold=%.0f", avg_noise, self.barge_in_threshold) - except Exception as e: - log.warning("Calibration failed: %s", e) - finally: - pa.terminate() - - # ─── G1 SPEAKER PLAYBACK ───────────────────────────── - - def _play_buffer_on_g1(self, pcm_24k: np.ndarray): - """Play 24kHz audio on G1 speaker (resample to 16kHz, single call).""" - if len(pcm_24k) < 100: - return - - # Resample 24kHz → 16kHz - tl = int(len(pcm_24k) * 16000 / 24000) - audio_16k = np.interp( - np.linspace(0, len(pcm_24k), tl, endpoint=False), - np.arange(len(pcm_24k)), - pcm_24k.astype(np.float64), - ).astype(np.int16) - - from unitree_sdk2py.g1.audio.g1_audio_api import ( - ROBOT_API_ID_AUDIO_START_PLAY, - ROBOT_API_ID_AUDIO_STOP_PLAY, - ) - - client = self._audio._client - if not client: - return - - app_name = "gemini" - client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, 
json.dumps({"app_name": app_name})) - time.sleep(0.1) - - pcm = audio_16k.tobytes() - sid = f"s_{int(time.time() * 1000)}" - param = json.dumps({ - "app_name": app_name, - "stream_id": sid, - "sample_rate": 16000, - "channels": 1, - "bits_per_sample": 16, - }) - client._CallRequestWithParamAndBin(ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm)) - - duration = len(audio_16k) / 16000 - time.sleep(duration + 0.3) - client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name})) - - # ─── WEBSOCKET TASKS ───────────────────────────────── - - async def _capture_mic(self, ws): - """Continuously capture mic via PyAudio and send to Gemini.""" - import pyaudio - import ctypes - ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p) - def _alsa_error_handler(filename, line, function, err, fmt): - pass # suppress - c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler) - try: - asound = ctypes.cdll.LoadLibrary("libasound.so.2") - asound.snd_lib_error_set_handler(c_error_handler) - except: pass # ALSA_suppress - pa = pyaudio.PyAudio() - - mic_info = pa.get_device_info_by_index(self._mic_device_idx) - mic_rate = int(mic_info["defaultSampleRate"]) - mic_channels = 1 - - # Open mic at native rate/channels - stream = pa.open(format=pyaudio.paInt16, channels=mic_channels, - rate=mic_rate, input=True, - input_device_index=self._mic_device_idx, - frames_per_buffer=CHUNK_SIZE) - - log.info("Mic stream: device=%d, rate=%d, ch=%d", self._mic_device_idx, mic_rate, mic_channels) - - loud_chunks = 0 - loop = asyncio.get_event_loop() - needs_resample = mic_rate != SEND_RATE or mic_channels != 1 - - try: - while self._running: - data = await loop.run_in_executor( - None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False)) - - # Convert to mono 16kHz if needed - if needs_resample: - audio = np.frombuffer(data, dtype=np.int16) - # Stereo to mono - if mic_channels == 2: - audio = audio.reshape(-1, 2).mean(axis=1).astype(np.int16) - # Resample to 16kHz - if mic_rate != SEND_RATE: - tl = int(len(audio) * SEND_RATE / mic_rate) - if tl > 0: - audio = np.interp( - np.linspace(0, len(audio), tl, endpoint=False), - np.arange(len(audio)), - audio.astype(np.float64), - ).astype(np.int16) - data = audio.tobytes() - - energy = audio_energy(data) - now = time.time() - - # Barge-in detection - if self.speaking and now >= self._barge_in_block_until: - if (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE: - if energy > self.barge_in_threshold: - loud_chunks += 1 - else: - loud_chunks = 0 - if loud_chunks > self.REQUIRED_LOUD_CHUNKS: - log.info("Barge-in detected!") - self.interrupted = True - self.speaking = False - while not self._audio_queue.empty(): - try: self._audio_queue.get_nowait() - except: break - loud_chunks = 0 - self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN - - # Echo suppression: send silence while speaking - data_to_send = data - if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking: - gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE - if energy < gate: - data_to_send = SILENCE_PCM - - # Send to Gemini - b64 = base64.b64encode(data_to_send).decode() - msg = { - "realtime_input": { - "media_chunks": [ - {"mime_type": f"audio/pcm;rate={SEND_RATE}", "data": b64} - ] - } - } - await ws.send(json.dumps(msg)) - - except Exception as e: - if self._running: - log.error("Mic error: %s", e) - finally: - stream.stop_stream() - stream.close() - pa.terminate() - - async def _receive_audio(self, ws): - """Receive 
audio responses and transcriptions from Gemini.""" - async for msg in ws: - if not self._running: - break - try: - response = json.loads(msg) - server_content = response.get("serverContent", {}) - - if server_content.get("interrupted"): - self.interrupted = False - - # User transcription (partial/streaming) - input_tr = ( - server_content.get("inputTranscription") - or server_content.get("input_transcription") - or server_content.get("inputAudioTranscription") - or server_content.get("input_audio_transcription") - ) - if isinstance(input_tr, dict): - text = (input_tr.get("text") or "").strip() - now = time.time() - if text and now >= self._ignore_input_until and not self.speaking: - # Buffer ASR text - if now - self._asr_last_time > self.ASR_WINDOW_SEC: - self._asr_buf = "" - self._asr_buf = text # Gemini sends cumulative transcription - self._asr_last_time = now - - if self.interrupted: - continue - - # Audio from Gemini - model_turn = server_content.get("modelTurn") - if model_turn: - for part in model_turn.get("parts", []): - inline_data = part.get("inlineData") - if inline_data: - audio_b64 = inline_data.get("data") - if audio_b64: - now = time.time() - if not self.speaking: - self._ai_speaking_since = now - # Gemini started responding — fire transcript callback - if self._asr_buf and self._on_transcript: - self._on_transcript(self._asr_buf, "user") - self.speaking = True - self._last_ai_audio_time = now - self._ignore_input_until = now + self.ECHO_GUARD_SEC - audio_bytes = base64.b64decode(audio_b64) - await self._audio_queue.put(audio_bytes) - - # Text from Gemini (thinking/response text) - text_part = part.get("text", "").strip() - if text_part and self._on_transcript: - self._on_transcript(text_part, "marcus") - - # Turn complete — Gemini finished speaking - turn_complete = server_content.get("turnComplete") - if turn_complete: - # Clear ASR buffer after turn - self._asr_buf = "" - - except Exception as e: - log.error("Receive error: %s", e) - - async def _play_audio(self): - """Collect Gemini audio chunks and play on G1 speaker.""" - while self._running: - try: - if not self.speaking: - await asyncio.sleep(0.05) - continue - - # Pre-buffer - buffered = False - while self.speaking and not buffered: - if self._audio_queue.qsize() >= self.PREBUFFER_CHUNKS: - buffered = True - else: - await asyncio.sleep(0.01) - - # Collect all audio chunks - buffer_chunks = [] - while self.speaking: - try: - data = await asyncio.wait_for( - self._audio_queue.get(), timeout=self.PLAYBACK_TIMEOUT) - audio = np.frombuffer(data, dtype=np.int16) - buffer_chunks.append(audio) - self._last_ai_audio_time = time.time() - except asyncio.TimeoutError: - if self._audio_queue.empty(): - if time.time() - self._last_ai_audio_time > 0.3: - break - - # Play on G1 speaker - if buffer_chunks: - full_audio = np.concatenate(buffer_chunks) - duration = len(full_audio) / RECEIVE_RATE - log.info("Playing %.1fs on G1", duration) - - await asyncio.get_event_loop().run_in_executor( - None, self._play_buffer_on_g1, full_audio) - - self.speaking = False - - except Exception as e: - log.error("Play error: %s", e) - self.speaking = False - - # ─── MAIN LOOP ──────────────────────────────────────── - - async def _run_async(self): - import websockets - import inspect - - system_prompt = load_system_prompt() - - # Unmute mic - subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True) - subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True) - - # Calibrate - 
-
-    # ─── MAIN LOOP ────────────────────────────────────────
-
-    async def _run_async(self):
-        import websockets
-        import inspect
-
-        system_prompt = load_system_prompt()
-
-        # Unmute mic
-        subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
-        subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)
-
-        # Calibrate
-        self._calibrate_mic()
-
-        ws_kwargs = {"max_size": None}
-        try:
-            sig = inspect.signature(websockets.connect)
-            if "extra_headers" in sig.parameters:
-                ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
-            else:
-                ws_kwargs["additional_headers"] = {"Content-Type": "application/json"}
-        except Exception:
-            ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
-
-        while self._running:
-            try:
-                log.info("Connecting to Gemini...")
-                async with websockets.connect(URI, **ws_kwargs) as ws:
-                    setup_msg = {
-                        "setup": {
-                            "model": MODEL,
-                            "generationConfig": {
-                                "responseModalities": ["AUDIO"],
-                                "thinkingConfig": {"thinkingBudget": 0},
-                                "speechConfig": {
-                                    "voiceConfig": {
-                                        "prebuiltVoiceConfig": {"voiceName": VOICE_NAME}
-                                    }
-                                },
-                            },
-                            "realtimeInputConfig": {
-                                "automaticActivityDetection": {
-                                    "startOfSpeechSensitivity": "START_SENSITIVITY_HIGH",
-                                    "prefixPaddingMs": 40,
-                                    "endOfSpeechSensitivity": "END_SENSITIVITY_HIGH",
-                                    "silenceDurationMs": 250,
-                                }
-                            },
-                            "inputAudioTranscription": {},
-                            "systemInstruction": {"parts": [{"text": system_prompt}]},
-                        }
-                    }
-                    await ws.send(json.dumps(setup_msg))
-                    await ws.recv()
-                    log.info("Connected! Always listening...")
-
-                    self._audio_queue = asyncio.Queue()
-
-                    await asyncio.gather(
-                        self._capture_mic(ws),
-                        self._receive_audio(ws),
-                        self._play_audio(),
-                    )
-
-            except Exception as e:
-                if self._running:
-                    log.error("Connection error: %s — reconnecting in 3s", e)
-                    await asyncio.sleep(3)
-
-    def _voice_thread(self):
-        asyncio.run(self._run_async())
-
-    # ─── START / STOP ─────────────────────────────────────
-
-    def start(self):
-        if self._running:
-            return
-        self._running = True
-        self._thread = threading.Thread(target=self._voice_thread, daemon=True, name="gemini_voice")
-        self._thread.start()
-        log.info("Gemini voice module started")
-
-    def stop(self):
-        self._running = False
-        if self._thread:
-            self._thread.join(timeout=5)
-            self._thread = None
-        log.info("Gemini voice module stopped")
-
-    @property
-    def is_running(self) -> bool:
-        return self._running
-
-    @property
-    def state(self) -> str:
-        return "LISTENING" if self._running else "STOPPED"
-
-    @property
-    def is_speaking(self) -> bool:
-        return self.speaking
-
-
-# ─── STANDALONE TEST ─────────────────────────────────────
-
-if __name__ == "__main__":
-    import sys
-    sys.path.insert(0, PROJECT_ROOT)
-    from API.audio_api import AudioAPI
-
-    def on_transcript(text, role):
-        print(f" [{role.upper()}] {text}")
-
-    audio = AudioAPI()
-    voice = GeminiVoiceModule(audio, on_transcript=on_transcript)
-
-    print("Gemini voice v2 — speak anytime. Ctrl+C to stop.\n")
-    voice.start()
-
-    try:
-        while voice.is_running:
-            time.sleep(0.5)
-    except KeyboardInterrupt:
-        print("\nStopping...")
-        voice.stop()
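The `inspect.signature` probe in `_run_async` above works around a rename in the websockets library: newer releases accept `additional_headers` where older ones took `extra_headers`. Factored into a standalone helper, under that same assumption:

# ── editor's sketch: websockets keyword probe, not part of the patch ──
import inspect

def header_kwarg(connect_fn) -> str:
    """Return the header keyword this websockets version accepts."""
    try:
        params = inspect.signature(connect_fn).parameters
        return "extra_headers" if "extra_headers" in params else "additional_headers"
    except (TypeError, ValueError):
        return "extra_headers"   # conservative fallback
# ──────────────────────────────────────────────────────────────────────

Usage would mirror the original: ws_kwargs[header_kwarg(websockets.connect)] = {"Content-Type": "application/json"}.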
diff --git a/Voice/marcus_voice.py b/Voice/marcus_voice.py
index 1ccb56c..83124ca 100644
--- a/Voice/marcus_voice.py
+++ b/Voice/marcus_voice.py
@@ -1,19 +1,20 @@
 #!/usr/bin/env python3
 """
-Features/Voice/marcus_voice.py — Marcus Always-Listening Voice Module
-======================================================================
+Voice/marcus_voice.py — Marcus Always-Listening Voice Module (English)
+=======================================================================
 State machine:
     IDLE → (wake word detected) → WAKE_HEARD
     WAKE_HEARD → (record command) → PROCESSING
     PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
     SPEAKING → (TTS done) → IDLE
-Wake word: "Marcus" / "ماركوس" (detected by Whisper tiny)
+Wake word: "Marcus" (detected by Whisper tiny)
 Commands: Transcribed by Whisper small
-TTS: Handled by API/audio_api.py
+Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
+TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)
 Usage:
-    from Features.Voice.marcus_voice import VoiceModule
+    from Voice.marcus_voice import VoiceModule
     voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
     voice.start()   # background thread
     voice.stop()
@@ -21,7 +22,6 @@ Usage:
 import logging
 import os
-import subprocess
 import threading
 import time
 import numpy as np
@@ -74,7 +74,8 @@ class VoiceModule:
         """
         Args:
             audio_api: AudioAPI instance (from API/audio_api.py)
-            on_command: callback(text: str, lang: str) — called when command is transcribed
+            on_command: callback(text: str, lang: str) — "lang" is always "en"
+                now; kept in the signature for interface stability.
         """
         self._audio = audio_api
         self._on_command = on_command
@@ -83,13 +84,23 @@ class VoiceModule:
         self._stt = self._config["stt"]
         self._mic = self._config["mic"]
 
-        # Whisper models — lazy loaded
+        # Whisper models — lazy loaded on first _voice_loop() iteration
         self._wake_model = None
         self._cmd_model = None
 
-        # Wake words
-        self._wake_en = [w.lower() for w in self._stt["wake_words_en"]]
-        self._wake_ar = self._stt["wake_words_ar"]
+        # Wake words (English only — built-in TTS doesn't do Arabic)
+        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
+                                                          ["marcus", "marcos"])]
+
+        # G1 built-in mic (UDP multicast).
+        from Voice.builtin_mic import BuiltinMic
+        _mcfg = self._config.get("mic_udp", {})
+        self._mic_capture = BuiltinMic(
+            group=_mcfg.get("group", "239.168.123.161"),
+            port=_mcfg.get("port", 5555),
+            buf_max=_mcfg.get("buffer_max_bytes", 64000),
+        )
+        self._sample_rate = self._mic_capture.sample_rate   # 16000
 
         # State
         self._state = State.IDLE
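Voice/builtin_mic.py itself is not included in this diff; only its surface is visible here (start(), read_chunk(), flush(), stop(), sample_rate). A minimal reader with that shape might look like the sketch below. The multicast group, port, and 16 kHz mono s16le format come from the docstrings in this patch; everything else (timeout, buffering policy) is an assumption:

# ── editor's sketch: minimal BuiltinMic-shaped reader, not the real module ──
import socket
import struct
import threading
import time

class BuiltinMicSketch:
    """Joins the G1 mic multicast group and buffers raw s16le PCM."""
    sample_rate = 16000   # 16 kHz mono, 16-bit

    def __init__(self, group="239.168.123.161", port=5555, buf_max=64000):
        self._group, self._port, self._buf_max = group, port, buf_max
        self._buf = bytearray()
        self._lock = threading.Lock()
        self._running = False
        self._sock = None

    def start(self):
        self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self._sock.bind(("", self._port))
        mreq = struct.pack("4sl", socket.inet_aton(self._group), socket.INADDR_ANY)
        self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
        self._sock.settimeout(0.5)
        self._running = True
        threading.Thread(target=self._rx_loop, daemon=True).start()

    def _rx_loop(self):
        while self._running:
            try:
                data, _ = self._sock.recvfrom(4096)
            except socket.timeout:
                continue
            except OSError:
                break   # socket closed by stop()
            with self._lock:
                self._buf.extend(data)
                if len(self._buf) > self._buf_max:   # drop oldest audio
                    del self._buf[:len(self._buf) - self._buf_max]

    def read_chunk(self, num_bytes: int, timeout: float = 2.0) -> bytes:
        """Block until some audio is buffered; b'' after `timeout` (stalled)."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            with self._lock:
                if self._buf:
                    out = bytes(self._buf[:num_bytes])
                    del self._buf[:len(out)]
                    return out
            time.sleep(0.01)
        return b""

    def flush(self):
        with self._lock:
            self._buf.clear()

    def stop(self):
        self._running = False
        if self._sock:
            self._sock.close()
# ────────────────────────────────────────────────────────────────────────────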
@@ -97,7 +108,7 @@ class VoiceModule:
         self._thread = None
         self._lock = threading.Lock()
 
-        log.info("VoiceModule initialized")
+        log.info("VoiceModule initialized (mic: G1 built-in UDP)")
 
     # ─── MODEL LOADING ────────────────────────────────────
@@ -115,69 +126,52 @@ class VoiceModule:
             self._cmd_model = whisper.load_model(self._stt["command_model"])
             log.info("Command model ready")
 
-    # ─── MIC RECORDING ────────────────────────────────────
+    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────
 
     def _record_chunk(self, seconds: float) -> np.ndarray:
-        """Record audio chunk from mic via parec."""
-        source = self._mic["source_index"]
-        rate = str(self._mic["rate"])
-
-        proc = subprocess.Popen(
-            ["parec", "-d", source,
-             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
-            stdout=subprocess.PIPE,
-        )
-        time.sleep(seconds)
-        proc.terminate()
-        raw = proc.stdout.read()
-        return np.frombuffer(raw, dtype=np.int16)
+        """Capture a fixed-duration chunk from the G1 built-in mic."""
+        num_bytes = int(seconds * self._sample_rate * 2)   # int16 mono
+        raw = bytearray()
+        step = 1024
+        while len(raw) < num_bytes:
+            data = self._mic_capture.read_chunk(min(step, num_bytes - len(raw)))
+            if not data:   # stream stalled — return what we have
+                break
+            raw.extend(data)
+        return np.frombuffer(bytes(raw), dtype=np.int16)
 
     def _record_until_silence(self) -> np.ndarray:
-        """Record until silence is detected or max duration reached."""
-        source = self._mic["source_index"]
-        rate = self._mic["rate"]
-        threshold = self._stt["silence_threshold"]
-        silence_dur = self._stt["silence_duration_sec"]
-        max_dur = self._stt["max_record_sec"]
+        """Capture until RMS drops below threshold for `silence_duration_sec`."""
+        threshold   = self._stt.get("silence_threshold", 500)
+        silence_dur = self._stt.get("silence_duration_sec", 1.5)
+        max_dur     = self._stt.get("max_record_sec", 15)
 
-        chunk_sec = 0.5
-        chunk_samples = int(rate * chunk_sec)
-        silence_chunks_needed = int(silence_dur / chunk_sec)
-        max_chunks = int(max_dur / chunk_sec)
+        chunk_sec   = 0.5
+        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
+        silence_chunks_need = int(silence_dur / chunk_sec)
+        max_chunks  = int(max_dur / chunk_sec)
 
-        proc = subprocess.Popen(
-            ["parec", "-d", source,
-             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
-            stdout=subprocess.PIPE,
-        )
-
-        all_audio = []
+        all_audio     = []
         silence_count = 0
-        chunk_count = 0
+        chunk_count   = 0
 
-        try:
-            while chunk_count < max_chunks:
-                data = proc.stdout.read(chunk_samples * 2)   # 2 bytes per sample
-                if not data:
-                    break
+        while chunk_count < max_chunks:
+            raw = self._mic_capture.read_chunk(chunk_bytes)
+            if not raw:
+                break
+            chunk = np.frombuffer(raw, dtype=np.int16)
+            all_audio.append(chunk)
+            chunk_count += 1
 
-                chunk = np.frombuffer(data, dtype=np.int16)
-                all_audio.append(chunk)
-                chunk_count += 1
+            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
+            if rms < threshold:
+                silence_count += 1
+            else:
+                silence_count = 0
 
-                # Check for silence
-                rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
-                if rms < threshold:
-                    silence_count += 1
-                else:
-                    silence_count = 0
-
-                if silence_count >= silence_chunks_needed and chunk_count > 2:
-                    log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
-                    break
-        finally:
-            proc.terminate()
-            proc.stdout.read()   # drain
+            if silence_count >= silence_chunks_need and chunk_count > 2:
+                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
+                break
 
         if all_audio:
             return np.concatenate(all_audio)
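Worked through with the defaults above: each 0.5 s chunk is 16000 samples/s * 0.5 s * 2 bytes = 16000 bytes, three consecutive quiet chunks end the recording, and the hard cap is 30 chunks (15 s):

# ── editor's sketch: the silence maths with the defaults above ──
sample_rate = 16000          # Hz, G1 built-in mic
chunk_sec   = 0.5
chunk_bytes = int(sample_rate * chunk_sec) * 2   # 16000 bytes per chunk
silence_chunks_need = int(1.5 / 0.5)             # 3 quiet chunks in a row
max_chunks  = int(15 / 0.5)                      # hard cap: 30 chunks = 15 s
# ────────────────────────────────────────────────────────────────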
@@ -205,38 +196,18 @@ class VoiceModule:
         return text
 
     def _check_wake_word(self, text: str) -> bool:
-        """Check if transcribed text contains a wake word."""
+        """Check if transcribed text contains an English wake word."""
         text_lower = text.lower().strip()
-
-        # English wake words
-        for w in self._wake_en:
-            if w in text_lower:
-                return True
-
-        # Arabic wake words
-        for w in self._wake_ar:
-            if w in text:
-                return True
-
-        return False
+        return any(w in text_lower for w in self._wake_en)
 
     # ─── MAIN LOOP ────────────────────────────────────────
 
     def _voice_loop(self):
         """Main voice processing loop — runs in background thread."""
         self._load_whisper()
+        self._mic_capture.start()
         log.info("Voice loop started — listening for wake word...")
 
-        # Unmute mic once
-        subprocess.run(
-            ["pactl", "set-source-mute", self._mic["source_index"], "0"],
-            capture_output=True,
-        )
-        subprocess.run(
-            ["pactl", "set-source-volume", self._mic["source_index"], "100%"],
-            capture_output=True,
-        )
-
         while self._running:
             try:
                 if self._state == State.IDLE:
@@ -279,9 +250,7 @@ class VoiceModule:
                 self._state = State.WAKE_HEARD
 
                 # Acknowledge
-                self._audio.speak(
-                    self._config["messages"]["wake_heard"], "en"
-                )
+                self._audio.speak(self._config["messages"]["wake_heard"])
 
     def _do_wake_heard(self):
         """Record the command until silence."""
@@ -294,7 +263,7 @@ class VoiceModule:
 
         if len(audio) < 4000:   # < 0.25s at 16kHz
             log.info("Too short, ignoring")
-            self._audio.speak(self._config["messages"]["no_speech"], "en")
+            self._audio.speak(self._config["messages"]["no_speech"])
             self._state = State.IDLE
             return
 
@@ -308,18 +277,16 @@ class VoiceModule:
 
         if not text or len(text.strip()) < 2:
             log.info("Empty transcription")
-            self._audio.speak(self._config["messages"]["no_speech"], "en")
+            self._audio.speak(self._config["messages"]["no_speech"])
             self._state = State.IDLE
             return
 
-        # Detect language
-        lang = "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
-        log.info("Command [%s]: %s", lang, text)
+        log.info("Command: %s", text)
 
-        # Send to brain callback
+        # Send to brain callback (lang always "en" in this build)
         if self._on_command:
             try:
-                self._on_command(text, lang)
+                self._on_command(text, "en")
             except Exception as e:
                 log.error("Brain callback error: %s", e)
 
@@ -342,6 +309,10 @@ class VoiceModule:
     def stop(self):
        """Stop voice listening."""
         self._running = False
+        try:
+            self._mic_capture.stop()
+        except Exception:
+            pass
         if self._thread:
             self._thread.join(timeout=5)
             self._thread = None
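For completeness, wiring the rewritten module end to end follows the usage block at the top of the file; a minimal driver using only names that appear in this patch (illustrative, not part of the change):

# ── editor's sketch: end-to-end wiring, mirrors the module docstring ──
from API.audio_api import AudioAPI
from Voice.marcus_voice import VoiceModule

def handle_voice_command(text: str, lang: str):
    print(f"[BRAIN] ({lang}) {text}")   # lang is always "en" in this build

audio = AudioAPI()
voice = VoiceModule(audio, on_command=handle_voice_command)
voice.start()    # background thread: wake word -> record -> transcribe
# ... later:
voice.stop()
# ──────────────────────────────────────────────────────────────────────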