Update 2026-04-21 16:10:00

This commit is contained in:
kassam 2026-04-21 16:10:03 +04:00
parent 8491be7f1e
commit e0f6acd5c7
24 changed files with 1291 additions and 1157 deletions

View File

@ -2,19 +2,24 @@
""" """
API/audio_api.py Marcus Audio API Layer API/audio_api.py Marcus Audio API Layer
========================================== ==========================================
Provides speak() and record_audio() for the Brain layer. Provides speak() and record() for the Brain layer.
Brain imports ONLY from this API, never from the unitree SDK directly. Brain imports ONLY from this API, never from the unitree SDK directly.
Speaker: _CallRequestWithParamAndBin (single call, full buffer)
Mic: parec -d 3 (Hollyland wireless, PulseAudio source index from config)
TTS EN: Unitree built-in TtsMaker
TTS AR: Piper ar_JO-kareem-medium → resample → G1 speaker
Speaker: Unitree built-in TtsMaker (G1 on-board engine, English only,
no MP3/WAV plumbing, no internet). An optional raw-PCM playback path
via _play_pcm() is kept for future modules that synthesize their
own audio (e.g. offline Piper).
Mic: G1 built-in mic (UDP multicast 239.168.123.161:5555, 16 kHz mono).
The legacy Hollyland/parec path is retained as a fallback when
config_Voice.json has mic.backend="pactl_parec".
TTS: English only. Arabic is rejected (the G1 firmware silently maps
Arabic to Chinese, which confuses everyone). If Arabic TTS is ever
needed again, use a separate offline backend like Piper.
Usage: Usage:
from API.audio_api import AudioAPI from API.audio_api import AudioAPI
audio = AudioAPI() audio = AudioAPI()
audio.speak("Hello", "en") audio.speak("Hello, I am Sanad")
audio.speak("مرحبا", "ar")
recording = audio.record(seconds=5) recording = audio.record(seconds=5)
audio.play_pcm(recording) audio.play_pcm(recording)
""" """
@ -71,7 +76,24 @@ class AudioAPI:
self._tts = self._config["tts"] self._tts = self._config["tts"]
self._mic = self._config["mic"] self._mic = self._config["mic"]
self._spk = self._config["speaker"] self._spk = self._config["speaker"]
self._target_rate = self._tts["target_sample_rate"] self._target_rate = self._tts.get("target_sample_rate", 16000)
# Default mic backend: G1 built-in UDP multicast.
# Set mic.backend="pactl_parec" in config_Voice.json to fall back
# to the legacy Hollyland/PulseAudio path.
self._mic_backend = self._mic.get("backend", "builtin_udp")
self._builtin_mic = None # lazy-initialized on first record()
# Built-in TTS wrapper (uses the already-initialized AudioClient).
# Keeps TTS synchronous so `is_speaking` is meaningful to the voice
# loop that needs to skip mic input during playback.
self._tts_engine = None
if self._sdk_available:
from Voice.builtin_tts import BuiltinTTS
self._tts_engine = BuiltinTTS(
self._client,
default_speaker_id=self._tts.get("builtin_speaker_id", 0),
)
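The body of BuiltinTTS is not part of this hunk; below is a hypothetical sketch consistent with the call sites in this file (a blocking speak(), ASCII-only input, and a duration estimate using the same len(text) * 0.06 s heuristic the old edge-tts fallback used further down — that heuristic is an assumption about the real module):

```python
# Hypothetical sketch of Voice/builtin_tts.py; only the constructor
# signature is visible in this diff.
import time

class BuiltinTTS:
    def __init__(self, client, default_speaker_id: int = 0):
        self._client = client                  # unitree_sdk2py AudioClient
        self._speaker_id = default_speaker_id

    def speak(self, text: str, block: bool = True):
        # English only: the G1 firmware silently maps Arabic to Chinese,
        # so non-ASCII input is rejected rather than mispronounced.
        if not text.isascii():
            raise ValueError("BuiltinTTS is English-only; got non-ASCII text")
        self._client.TtsMaker(text, self._speaker_id)
        if block:
            # TtsMaker returns before playback finishes; sleep an estimated
            # duration (assumed heuristic) so callers can treat speak() as
            # synchronous and is_speaking stays meaningful.
            time.sleep(max(2.0, len(text) * 0.06))
```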
# Data dir # Data dir
data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"]) data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"])
@ -82,7 +104,10 @@ class AudioAPI:
self._speaking = False self._speaking = False
self._speak_lock = threading.Lock() self._speak_lock = threading.Lock()
log.info(self._config["messages"]["ready"]) log.info("%s (mic=%s, tts=%s)",
self._config["messages"]["ready"],
self._mic_backend,
"builtin_ttsmaker" if self._tts_engine else "disabled")
def _init_sdk(self): def _init_sdk(self):
"""Initialize Unitree AudioClient.""" """Initialize Unitree AudioClient."""
@ -105,55 +130,63 @@ class AudioAPI:
# ─── SPEAK ──────────────────────────────────────────── # ─── SPEAK ────────────────────────────────────────────
def speak(self, text: str, lang: str = "auto"): def speak(self, text: str, lang: str = "en"):
""" """
Speak text in the given language. Speak `text` in English through the G1 built-in TTS (TtsMaker).
Mutes mic during playback to prevent self-listening.
lang="en" built-in TtsMaker
lang="ar" Piper resample G1 speaker
lang="auto" detect from text
"""
if lang == "auto":
lang = self._detect_lang(text)
log.info("[%s] speak: %s", lang.upper(), text[:80]) Mutes (flushes) the mic during playback so the voice loop doesn't
hear the robot's own voice and transcribe itself. The `lang`
argument is accepted for API compatibility, but only "en" plays;
non-ASCII text (Arabic) is rejected by BuiltinTTS.
"""
if lang and lang not in ("en", "auto"):
log.warning("builtin_tts only supports English; got lang=%r — skipping", lang)
return
if self._tts_engine is None:
log.error("No TTS engine initialized — audio SDK unavailable")
return
log.info("speak: %s", text[:80])
with self._speak_lock: with self._speak_lock:
self._speaking = True self._speaking = True
self._mute_mic() self._mute_mic()
try: try:
if lang == "en": self._tts_engine.speak(text, block=True)
self._speak_english(text)
elif lang == "ar":
self._speak_arabic(text)
else:
log.warning("Unknown lang '%s', falling back to English", lang)
self._speak_english(text)
except Exception as e: except Exception as e:
log.error("%s: %s", self._config["messages"]["error_tts"], e) log.error("%s: %s", self._config["messages"]["error_tts"], e)
finally: finally:
# Small delay so speaker fully stops before mic reopens # Small tail so the speaker fully finishes before the mic is
time.sleep(0.3) # re-opened for capture
time.sleep(0.2)
self._unmute_mic() self._unmute_mic()
self._speaking = False self._speaking = False
def _mute_mic(self): def _mute_mic(self):
"""Mute the wireless mic to prevent self-listening.""" """
Suppress mic input during TTS playback.
For the UDP built-in mic, flush the buffer so we don't capture any
echo that's already been queued. For the legacy PulseAudio path,
actually mute the source.
"""
if self._mic_backend == "builtin_udp":
if self._builtin_mic is not None:
self._builtin_mic.flush()
return
source = self._mic["source_index"] source = self._mic["source_index"]
subprocess.run( subprocess.run(["pactl", "set-source-mute", source, "1"],
["pactl", "set-source-mute", source, "1"], capture_output=True)
capture_output=True,
)
log.debug("Mic muted") log.debug("Mic muted")
def _unmute_mic(self): def _unmute_mic(self):
"""Unmute the wireless mic.""" """Re-enable mic after TTS playback (pactl path only)."""
if self._mic_backend == "builtin_udp":
if self._builtin_mic is not None:
self._builtin_mic.flush()
return
source = self._mic["source_index"] source = self._mic["source_index"]
subprocess.run( subprocess.run(["pactl", "set-source-mute", source, "0"],
["pactl", "set-source-mute", source, "0"], capture_output=True)
capture_output=True,
)
log.debug("Mic unmuted") log.debug("Mic unmuted")
@property @property
@ -161,88 +194,8 @@ class AudioAPI:
"""True while TTS is playing — voice module checks this.""" """True while TTS is playing — voice module checks this."""
return self._speaking return self._speaking
def _speak_english(self, text: str):
"""English TTS via edge-tts."""
self._speak_edge_tts(text, "en")
def _speak_arabic(self, text: str):
"""Arabic TTS via edge-tts."""
self._speak_edge_tts(text, "ar")
def speak_piper_en(self, text: str):
"""Alternative: English via Piper instead of built-in."""
voice = self._tts["piper_voice_en"]
audio, rate = self._piper_synthesize(text, voice)
audio_16k = self._resample(audio, rate)
self._play_pcm(audio_16k)
# ─── PIPER TTS ────────────────────────────────────────
def _piper_synthesize(self, text: str, voice: str) -> tuple:
"""Run Piper CLI, return (audio_int16, sample_rate)."""
cmd = ["piper", "--model", voice, "--output_raw"]
timeout = self._tts["piper_timeout_sec"]
proc = subprocess.run(
cmd,
input=text.encode("utf-8"),
capture_output=True,
timeout=timeout,
)
if proc.returncode != 0:
stderr = proc.stderr.decode()[:300]
raise RuntimeError(f"Piper failed: {stderr}")
audio = np.frombuffer(proc.stdout, dtype=np.int16)
piper_rate = self._tts["piper_sample_rate"]
log.info("Piper: %d samples @ %dHz (%.1fs)", len(audio), piper_rate, len(audio) / piper_rate)
return audio, piper_rate
# ─── RESAMPLE ─────────────────────────────────────────
def _speak_edge_tts(self, text: str, lang: str):
"""Generate speech via edge-tts and play on G1."""
import os as _os
voice = "ar-AE-HamdanNeural" if lang == "ar" else "en-US-GuyNeural"
ts = int(time.time() * 1000)
mp3_path = f"/tmp/edge_{lang}_{ts}.mp3"
wav_path = f"/tmp/edge_{lang}_{ts}.wav"
safe_text = text.replace('"', '\\"')
code = f'import edge_tts, asyncio; asyncio.run(edge_tts.Communicate(\"{safe_text}\", voice=\"{voice}\").save(\"{mp3_path}\"))'
result = subprocess.run(["python3", "-c", code], capture_output=True, text=True, timeout=30)
if result.returncode != 0:
log.error("edge-tts failed: %s", result.stderr[:200])
if lang == "en" and self._sdk_available:
self._client.TtsMaker(text, self._tts.get("builtin_speaker_id", 1))
time.sleep(max(2.0, len(text) * 0.06))
return
try:
from pydub import AudioSegment
a = AudioSegment.from_mp3(mp3_path)
a = a.set_frame_rate(16000).set_channels(1).set_sample_width(2)
a.export(wav_path, format="wav")
import wave
with wave.open(wav_path, "rb") as wf:
audio = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)
_os.unlink(mp3_path)
_os.unlink(wav_path)
self._play_pcm(audio)
except Exception as e:
log.error("edge-tts conversion error: %s", e)
try: _os.unlink(mp3_path)
except: pass
try: _os.unlink(wav_path)
except: pass
def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray: def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray:
"""Resample to target rate (16kHz).""" """Linear resample int16 PCM to self._target_rate (16 kHz)."""
if src_rate == self._target_rate: if src_rate == self._target_rate:
return audio return audio
tl = int(len(audio) * self._target_rate / src_rate) tl = int(len(audio) * self._target_rate / src_rate)
@ -252,7 +205,7 @@ class AudioAPI:
audio.astype(np.float64), audio.astype(np.float64),
).astype(np.int16) ).astype(np.int16)
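Pieced together across the hunk boundary above, the resampler is plain linear interpolation via `np.interp`; an equivalent self-contained sketch (standalone function, not the exact method body):

```python
import numpy as np

def linear_resample(audio: np.ndarray, src_rate: int, dst_rate: int = 16000) -> np.ndarray:
    """Linearly resample mono int16 PCM from src_rate to dst_rate."""
    if src_rate == dst_rate:
        return audio
    n_out = int(len(audio) * dst_rate / src_rate)
    # Positions of the output samples expressed on the input sample grid.
    x_out = np.linspace(0, len(audio) - 1, n_out)
    x_in = np.arange(len(audio))
    return np.interp(x_out, x_in, audio.astype(np.float64)).astype(np.int16)
```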
# ─── G1 SPEAKER PLAYBACK ───────────────────────────── # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────
def _play_pcm(self, audio_16k: np.ndarray) -> float: def _play_pcm(self, audio_16k: np.ndarray) -> float:
"""Play 16kHz mono int16 on G1 speaker. Returns duration.""" """Play 16kHz mono int16 on G1 speaker. Returns duration."""
@ -308,24 +261,50 @@ class AudioAPI:
# ─── MIC RECORDING ─────────────────────────────────── # ─── MIC RECORDING ───────────────────────────────────
def record(self, seconds: float = 5.0) -> np.ndarray: def record(self, seconds: float = 5.0) -> np.ndarray:
"""Record from Hollyland wireless mic via parec. Returns int16 array.""" """
Capture `seconds` of int16 mono 16 kHz PCM.
Default backend is the G1 built-in mic (UDP multicast). Set
mic.backend="pactl_parec" in config_Voice.json to use the
legacy Hollyland/parec path instead.
"""
if self._mic_backend == "builtin_udp":
return self._record_builtin(seconds)
return self._record_parec(seconds)
def _record_builtin(self, seconds: float) -> np.ndarray:
"""Built-in mic path — join UDP multicast, read the requested duration."""
if self._builtin_mic is None:
from Voice.builtin_mic import BuiltinMic
mcfg = self._config.get("mic_udp", {})
self._builtin_mic = BuiltinMic(
group=mcfg.get("group", "239.168.123.161"),
port=mcfg.get("port", 5555),
buf_max=mcfg.get("buffer_max_bytes", 64000),
)
self._builtin_mic.start()
time.sleep(0.2) # let the receiver thread fill in
log.info("Recording %.1fs from G1 built-in mic", seconds)
raw = self._builtin_mic.read_seconds(seconds)
audio = np.frombuffer(raw, dtype=np.int16)
log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std())
if audio.std() < 50:
log.warning(self._config["messages"]["error_mic"] +
" — G1 mic silent (check audio service on robot)")
return audio
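The sizing arithmetic behind read_seconds is worth making explicit: 16 kHz mono s16le is 32,000 bytes per second, so the default 64,000-byte buffer in config_Voice.json holds two seconds of audio. A sketch (the helper name is hypothetical):

```python
def pcm_bytes(seconds: float, rate: int = 16000, sample_width: int = 2) -> int:
    """Bytes of s16le mono PCM needed to hold `seconds` of audio."""
    return int(seconds * rate) * sample_width

assert pcm_bytes(1.0) == 32_000
assert pcm_bytes(2.0) == 64_000   # == mic_udp.buffer_max_bytes default
```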
def _record_parec(self, seconds: float) -> np.ndarray:
"""Legacy Hollyland/PulseAudio path — only used if mic.backend='pactl_parec'."""
source = self._mic["source_index"] source = self._mic["source_index"]
rate = str(self._mic["rate"]) rate = str(self._mic["rate"])
channels = str(self._mic["channels"]) channels = str(self._mic["channels"])
fmt = self._mic["format"] fmt = self._mic["format"]
# Unmute mic subprocess.run(["pactl", "set-source-mute", source, "0"], capture_output=True)
subprocess.run( subprocess.run(["pactl", "set-source-volume", source, "100%"], capture_output=True)
["pactl", "set-source-mute", source, "0"],
capture_output=True,
)
subprocess.run(
["pactl", "set-source-volume", source, "100%"],
capture_output=True,
)
log.info("Recording %.1fs from mic source %s", seconds, source)
log.info("Recording %.1fs from mic source %s (parec)", seconds, source)
proc = subprocess.Popen( proc = subprocess.Popen(
["parec", "-d", source, ["parec", "-d", source,
f"--format={fmt}", f"--rate={rate}", f"--channels={channels}", "--raw"], f"--format={fmt}", f"--rate={rate}", f"--channels={channels}", "--raw"],
@ -337,10 +316,8 @@ class AudioAPI:
audio = np.frombuffer(raw, dtype=np.int16) audio = np.frombuffer(raw, dtype=np.int16)
log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std()) log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std())
if audio.std() < 50: if audio.std() < 50:
log.warning(self._config["messages"]["error_mic"] + " — mic may be silent") log.warning(self._config["messages"]["error_mic"] + " — mic may be silent")
return audio return audio
def save_recording(self, audio: np.ndarray, name: str) -> str: def save_recording(self, audio: np.ndarray, name: str) -> str:
@ -355,16 +332,6 @@ class AudioAPI:
log.info("Saved: %s", path) log.info("Saved: %s", path)
return path return path
# ─── LANGUAGE DETECTION ───────────────────────────────
@staticmethod
def _detect_lang(text: str) -> str:
"""Detect language from text — Arabic Unicode range check."""
for c in text:
if '\u0600' <= c <= '\u06FF':
return "ar"
return "en"
# ─── STATUS ─────────────────────────────────────────── # ─── STATUS ───────────────────────────────────────────
@property @property
@ -378,27 +345,16 @@ if __name__ == "__main__":
import argparse import argparse
parser = argparse.ArgumentParser(description="Marcus Audio API Test") parser = argparse.ArgumentParser(description="Marcus Audio API Test")
parser.add_argument("--test", action="store_true", help="Run speak tests") parser.add_argument("--test", action="store_true", help="Run TTS + record test")
parser.add_argument("--speak", type=str, help="Speak this text") parser.add_argument("--speak", type=str, help="Speak this English text")
parser.add_argument("--lang", default="auto", help="Language: en, ar, auto")
parser.add_argument("--record", type=float, default=0, help="Record N seconds") parser.add_argument("--record", type=float, default=0, help="Record N seconds")
args = parser.parse_args() args = parser.parse_args()
api = AudioAPI() api = AudioAPI()
if args.test: if args.test:
print("\n--- English built-in ---") print("\n--- English (TtsMaker) ---")
api.speak("Hello, I am Marcus.", "en") api.speak("Hello, I am Sanad.")
time.sleep(1)
print("\n--- Arabic Piper ---")
api.speak("مرحبا، أنا ماركوس", "ar")
time.sleep(1)
print("\n--- Auto-detect ---")
api.speak("How are you?")
time.sleep(1)
api.speak("كيف حالك؟")
time.sleep(1) time.sleep(1)
print("\n--- Record 3s + playback ---") print("\n--- Record 3s + playback ---")
@ -408,7 +364,7 @@ if __name__ == "__main__":
print("\nDone.") print("\nDone.")
elif args.speak: elif args.speak:
api.speak(args.speak, args.lang) api.speak(args.speak)
elif args.record > 0: elif args.record > 0:
rec = api.record(args.record) rec = api.record(args.record)

View File

@ -49,9 +49,28 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
print(f"marcus_yolo.py not found ({e})") print(f"marcus_yolo.py not found ({e})")
return False return False
# GPU is required — let RuntimeError from _resolve_device propagate so # GPU is required. _resolve_device() raises RuntimeError when CUDA is
# Marcus hard-fails at startup instead of silently running without vision. # missing — surface that with an actionable banner before re-raising so
ok = start_yolo(raw_frame_ref=raw_frame_ref, frame_lock=frame_lock) # Marcus hard-fails with a clear error instead of a raw stack trace.
try:
ok = start_yolo(raw_frame_ref=raw_frame_ref, frame_lock=frame_lock)
except RuntimeError as e:
print()
print("" + "" * 68 + "")
print("║ MARCUS STARTUP ABORTED — GPU REQUIRED".ljust(69) + "")
print("" + "" * 68 + "")
print(f"{str(e)[:66]:<66}")
print("" + " " * 68 + "")
print("║ On the Jetson, verify:".ljust(69) + "")
print("║ tegrastats # GPU exists & is not throttled".ljust(69) + "")
print("║ python3 -c 'import torch; print(torch.cuda.is_available())'".ljust(69) + "")
print("║ nvcc --version # CUDA toolkit reachable".ljust(69) + "")
print("║ Expected: torch 2.1.0 nv23.06, CUDA 11.4, GPU=Orin.".ljust(69) + "")
print("║ See Doc/environment.md section 9 for the reinstall recipe.".ljust(69) + "")
print("" + "" * 68 + "")
print()
raise
if ok: if ok:
YOLO_AVAILABLE = True YOLO_AVAILABLE = True
yolo_sees = _ys yolo_sees = _ys
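`_resolve_device()` itself lives in `Vision/marcus_yolo.py` and is not part of this diff; a hypothetical sketch of the gate it implements (names and messages are illustrative, not the real code):

```python
# Hypothetical sketch of the CUDA gate in Vision/marcus_yolo.py.
import torch

def _resolve_device(requested: str = "cuda") -> str:
    if requested != "cuda":
        raise RuntimeError(f"GPU is mandatory for YOLO; got device={requested!r}")
    if not torch.cuda.is_available():
        raise RuntimeError(
            "CUDA unavailable. Verify with tegrastats, "
            "python3 -c 'import torch; print(torch.cuda.is_available())', "
            "and nvcc --version (see Doc/environment.md section 9)."
        )
    return "cuda"
```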

View File

@ -1,7 +1,16 @@
""" """
zmq_api.py ZMQ velocity + command interface to Holosoma zmq_api.py ZMQ velocity + command interface to Holosoma
Previously the PUB socket was bound at module import time. That made the
module unsafe to re-import from any multiprocessing child (e.g. the LiDAR
SLAM_worker spawn), because the child would try to rebind the same port
and crash with `Address already in use`.
The bind now lives in init_zmq(); call it once from the brain entrypoint.
Child processes can import this module without any network side effects.
""" """
import json import json
import os
import time import time
import zmq import zmq
from Core.config_loader import load_config from Core.config_loader import load_config
@ -15,35 +24,62 @@ STOP_ITERATIONS = _cfg["stop_iterations"]
STOP_DELAY = _cfg["stop_delay"] STOP_DELAY = _cfg["stop_delay"]
STEP_PAUSE = _cfg["step_pause"] STEP_PAUSE = _cfg["step_pause"]
ctx = zmq.Context() # Shared state. These stay None until init_zmq() is called.
sock = ctx.socket(zmq.PUB) ctx: zmq.Context = None
sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}") sock: zmq.Socket = None
time.sleep(0.5) _INIT_SETTLE = 0.5 # seconds to let PUB tell subscribers it's alive
log(f"ZMQ PUB bound on tcp://{ZMQ_HOST}:{ZMQ_PORT}", "info", "zmq")
def init_zmq() -> zmq.Socket:
"""
Bind the PUB socket. Idempotent; safe to call more than once.
Call from the main (parent) process only. Do NOT call from multiprocessing
children; they inherit nothing useful from the bound socket anyway.
"""
global ctx, sock
if sock is not None:
return sock
ctx = zmq.Context()
sock = ctx.socket(zmq.PUB)
sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}")
time.sleep(_INIT_SETTLE)
log(f"ZMQ PUB bound on tcp://{ZMQ_HOST}:{ZMQ_PORT} (pid={os.getpid()})",
"info", "zmq")
return sock
def _ensure_sock() -> zmq.Socket:
if sock is None:
raise RuntimeError(
"zmq_api not initialized — call init_zmq() from the brain "
"entrypoint before using send_vel/send_cmd/gradual_stop"
)
return sock
def get_socket(): def get_socket():
"""Return the shared ZMQ PUB socket (for odometry to reuse).""" """Return the shared ZMQ PUB socket (for odometry to reuse)."""
return sock return _ensure_sock()
def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0): def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0):
"""Send velocity to Holosoma. vx m/s | vy m/s | vyaw rad/s""" """Send velocity to Holosoma. vx m/s | vy m/s | vyaw rad/s"""
sock.send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}})) _ensure_sock().send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))
def gradual_stop(): def gradual_stop():
"""Smooth deceleration to zero over ~1 second.""" """Smooth deceleration to zero over ~1 second."""
s = _ensure_sock()
for _ in range(STOP_ITERATIONS): for _ in range(STOP_ITERATIONS):
send_vel(0.0, 0.0, 0.0) s.send_string(json.dumps({"vel": {"vx": 0.0, "vy": 0.0, "vyaw": 0.0}}))
time.sleep(STOP_DELAY) time.sleep(STOP_DELAY)
def send_cmd(cmd: str): def send_cmd(cmd: str):
"""Send Holosoma state command: start | walk | stand | stop""" """Send Holosoma state command: start | walk | stand | stop"""
sock.send_string(json.dumps({"cmd": cmd})) _ensure_sock().send_string(json.dumps({"cmd": cmd}))
# Load MOVE_MAP from navigation config # Load MOVE_MAP from navigation config (pure data, safe at import time)
_nav = load_config("Navigation") _nav = load_config("Navigation")
MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()} MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()}
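A usage sketch of the new contract (hypothetical call site; host/port values match this config):

```python
# Parent process (brain entrypoint): bind exactly once, then publish.
from API.zmq_api import init_zmq, send_cmd, send_vel, gradual_stop

init_zmq()            # binds tcp://127.0.0.1:5556; idempotent on re-call
send_cmd("start")
send_vel(vx=0.3)      # walk forward at 0.3 m/s
gradual_stop()

# A child process (e.g. the LiDAR SLAM worker) can now `import API.zmq_api`
# with no network side effect; calling send_vel there raises a clear
# RuntimeError instead of crashing with "Address already in use".
```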

View File

@ -292,7 +292,10 @@ class AutonomousMode:
self._enabled = False self._enabled = False
break break
time.sleep(YOLO_CHECK_INTERVAL) # No trailing sleep — _move_forward() takes FORWARD_DURATION,
# _turn() takes TURN_DURATION, and LLaVA assessment is ~1-2s.
# The body always consumes real wall time, so an extra sleep here
# would be pure dead time.
# Clean up # Clean up
self._gradual_stop() self._gradual_stop()

View File

@ -17,7 +17,7 @@ PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if PROJECT_DIR not in sys.path: if PROJECT_DIR not in sys.path:
sys.path.insert(0, PROJECT_DIR) sys.path.insert(0, PROJECT_DIR)
from API.zmq_api import send_vel, gradual_stop, send_cmd from API.zmq_api import init_zmq, send_vel, gradual_stop, send_cmd
from API.camera_api import start_camera, stop_camera, get_frame from API.camera_api import start_camera, stop_camera, get_frame
from API.yolo_api import ( from API.yolo_api import (
init_yolo, yolo_summary, yolo_fps, init_yolo, yolo_summary, yolo_fps,
@ -70,7 +70,19 @@ _NAT_GOAL_RE = re.compile(
# ══════════════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════════════
def init_brain(): def init_brain():
"""Initialize all subsystems. Call once at startup.""" """Initialize all subsystems. Call once at startup from the parent process.
Optional subsystems (lidar / voice / imgsearch / autonomous) are gated on
`config_Brain.json::subsystems.<name>`. Disabling the ones you don't need
brings Marcus's boot time down from ~18 s to ~5-7 s.
"""
subsys = _cfg.get("subsystems", {}) or {}
# Bind the ZMQ PUB socket before anything tries to publish on it.
# This is now explicit (previously it happened as an import side effect,
# which crashed every multiprocessing child that re-imported zmq_api).
init_zmq()
raw_frame, raw_lock = start_camera() raw_frame, raw_lock = start_camera()
init_yolo(raw_frame, raw_lock) init_yolo(raw_frame, raw_lock)
@ -79,53 +91,65 @@ def init_brain():
init_memory() init_memory()
# LiDAR (optional — continues without it) # LiDAR — optional
try: if subsys.get("lidar", True):
from API.lidar_api import init_lidar try:
init_lidar() from API.lidar_api import init_lidar
except Exception as e: init_lidar()
print(f" [LiDAR] Init failed: {e} — continuing without LiDAR") except Exception as e:
print(f" [LiDAR] Init failed: {e} — continuing without LiDAR")
else:
print(" [LiDAR] disabled by config")
init_imgsearch( # Image search — optional
get_frame_fn=get_frame, if subsys.get("imgsearch", False):
send_vel_fn=send_vel, init_imgsearch(
gradual_stop_fn=gradual_stop, get_frame_fn=get_frame,
llava_fn=call_llava, send_vel_fn=send_vel,
yolo_sees_fn=yolo_sees, gradual_stop_fn=gradual_stop,
model=OLLAMA_MODEL, llava_fn=call_llava,
) yolo_sees_fn=yolo_sees,
model=OLLAMA_MODEL,
)
else:
print(" [ImgSearch] disabled by config")
# Autonomous exploration mode # Autonomous exploration mode — optional
from API.memory_api import mem as _mem_ref if subsys.get("autonomous", True):
from API.llava_api import PATROL_PROMPT from API.memory_api import mem as _mem_ref
auto = AutonomousMode( from API.llava_api import PATROL_PROMPT
get_frame_fn=get_frame, auto = AutonomousMode(
send_vel_fn=send_vel, get_frame_fn=get_frame,
gradual_stop_fn=gradual_stop, send_vel_fn=send_vel,
yolo_sees_fn=yolo_sees, gradual_stop_fn=gradual_stop,
yolo_summary_fn=yolo_summary, yolo_sees_fn=yolo_sees,
yolo_all_classes_fn=yolo_all_classes, yolo_summary_fn=yolo_summary,
yolo_closest_fn=yolo_closest, yolo_all_classes_fn=yolo_all_classes,
odom_fn=lambda: {"x": 0, "y": 0, "heading": 0}, # fallback if no odom yolo_closest_fn=yolo_closest,
call_llava_fn=call_llava, odom_fn=lambda: {"x": 0, "y": 0, "heading": 0},
patrol_prompt=PATROL_PROMPT, call_llava_fn=call_llava,
mem=_mem_ref, patrol_prompt=PATROL_PROMPT,
) mem=_mem_ref,
# Wire odometry if available )
from API.odometry_api import odom as _odom_ref, ODOM_AVAILABLE from API.odometry_api import odom as _odom_ref, ODOM_AVAILABLE
if _odom_ref and ODOM_AVAILABLE: if _odom_ref and ODOM_AVAILABLE:
auto._odom_pos = lambda: { auto._odom_pos = lambda: {
"x": _odom_ref._x, "y": _odom_ref._y, "heading": _odom_ref._heading "x": _odom_ref._x, "y": _odom_ref._y, "heading": _odom_ref._heading
} }
init_autonomous(auto) init_autonomous(auto)
else:
print(" [Autonomous] disabled by config")
send_cmd("start") send_cmd("start")
time.sleep(0.5) time.sleep(0.5)
send_cmd("walk") send_cmd("walk")
time.sleep(0.5) time.sleep(0.5)
# Voice module (optional — continues without it) # Voice module — optional
_init_voice() if subsys.get("voice", True):
_init_voice()
else:
print(" [Voice] disabled by config")
_log("Brain initialized", "info", "brain") _log("Brain initialized", "info", "brain")
_warmup_llava() _warmup_llava()
@ -137,44 +161,37 @@ _voice_module = None
def _init_voice(): def _init_voice():
"""Initialize voice module — runs in background, calls process_command on speech.""" """
Initialize the voice subsystem: G1 built-in mic + Whisper STT + G1
built-in TtsMaker for replies. Every transcribed command flows through
process_command(), and the resulting `speak` string is sent to the G1
speaker.
"""
global _audio_api, _voice_module global _audio_api, _voice_module
try: try:
from API.audio_api import AudioAPI from API.audio_api import AudioAPI
from Voice.marcus_gemini_voice import GeminiVoiceModule as VoiceModule from Voice.marcus_voice import VoiceModule
_audio_api = AudioAPI() _audio_api = AudioAPI()
def _voice_callback(text, role): def _on_command(text, lang):
"""Gemini voice callback.""" text = (text or "").strip()
pass # handled below if not text:
if role != "user" or not text.strip():
return return
t = text.strip().lower() print(f" [Voice] {text}")
act_kw = ["turn","move","go","walk","step","stop","come","wave","clap", try:
"high five","shake","hug","forward","backward","left","right", result = process_command(text)
"what do you see","what can you see","look","describe","patrol", except Exception as e:
"دور","امشي","روح","تقدم","ارجع","وقف","قف","تعال", print(f" [Brain] Error processing voice command: {e}")
"يمين","يسار","قدام","ورا","لوح","صفق","سلم", return
"شو شايف","شو تشوف","ماذا ترى","شو قدامك","لف","خطوات"] if isinstance(result, dict):
if any(kw in t for kw in act_kw): sp = (result.get("speak") or "").strip()
print(f" [Brain] Action: {text.strip()}") if sp and _audio_api:
try: _audio_api.speak(sp)
result = process_command(text.strip())
if isinstance(result, dict):
sp = result.get("speak", "")
vis_kw = ["see","look","describe","شايف","تشوف","ترى","قدامك"]
if any(k in t for k in vis_kw) and sp and _audio_api:
print(f" [Brain] Vision: {sp}")
_audio_api.speak(sp)
except Exception as e:
print(f" [Brain] Error: {e}")
else:
print(f" [Chat] {text.strip()}")
_voice_module = VoiceModule(_audio_api, on_transcript=_voice_callback) _voice_module = VoiceModule(_audio_api, on_command=_on_command)
_voice_module.start() _voice_module.start()
print(f" [Voice] Always listening (Gemini voice)") print(" [Voice] Always listening (Whisper + G1 mic + TtsMaker)")
except Exception as e: except Exception as e:
print(f" [Voice] Init failed: {e} — continuing without voice") print(f" [Voice] Init failed: {e} — continuing without voice")
_audio_api = None _audio_api = None
@ -255,7 +272,7 @@ def process_command(cmd: str) -> dict:
# ── Greeting ───────────────────────────────────────────────────────── # ── Greeting ─────────────────────────────────────────────────────────
if re.match(r"^(?:hi+|hey+|hello+|hola|salam|marhaba|sup|yo+|ahlan|السلام عليكم|مرحبا|أهلا|هلا|يا هلا)\s*[!.]*$", cmd, re.IGNORECASE): if re.match(r"^(?:hi+|hey+|hello+|hola|salam|marhaba|sup|yo+|ahlan|السلام عليكم|مرحبا|أهلا|هلا|يا هلا)\s*[!.]*$", cmd, re.IGNORECASE):
response = "Hello! I am Marcus. How can I help you?" response = "Hello! I am Sanad. How can I help you?"
print(f"Marcus: {response}") print(f"Marcus: {response}")
add_to_history(cmd, response) add_to_history(cmd, response)
log_cmd(cmd, response) log_cmd(cmd, response)
@ -346,10 +363,15 @@ def _handle_llava(cmd):
t0 = time.time() t0 = time.time()
img = get_frame() img = get_frame()
# Poll up to 500 ms in 50 ms slices instead of blocking a full second.
# Returns the moment a frame is available — most drops recover in <100 ms.
if img is None: if img is None:
print(" Waiting for camera...") print(" Waiting for camera...")
time.sleep(1.0) for _ in range(10):
img = get_frame() time.sleep(0.05)
img = get_frame()
if img is not None:
break
if img is None: if img is None:
print(" Camera not ready — command cancelled") print(" Camera not ready — command cancelled")
@ -461,7 +483,7 @@ def run_terminal():
status = get_brain_status() status = get_brain_status()
print() print()
print("=" * 48) print("=" * 48)
print(" MARCUS AI BRAIN — READY") print(" SANAD AI BRAIN — READY")
print("=" * 48) print("=" * 48)
for k, v in status.items(): for k, v in status.items():
print(f" {k:<10}: {v}") print(f" {k:<10}: {v}")

View File

@ -3,13 +3,19 @@
"max_history": 6, "max_history": 6,
"num_batch": 128, "num_batch": 128,
"num_ctx": 2048, "num_ctx": 2048,
"num_predict_main": 200, "subsystems": {
"lidar": true,
"voice": true,
"imgsearch": false,
"autonomous": true
},
"num_predict_main": 120,
"num_predict_goal": 80, "num_predict_goal": 80,
"num_predict_patrol": 100, "num_predict_patrol": 100,
"num_predict_talk": 80, "num_predict_talk": 80,
"num_predict_verify": 10, "num_predict_verify": 10,
"warmup_num_predict": 5, "warmup_num_predict": 5,
"main_prompt": "You are Marcus, a humanoid robot. Look at the image and follow the command.\n{facts}\n\nCommand: \"{command}\"\n\nReply with ONLY this JSON — no markdown, no explanation:\n{{\"actions\":[{{\"move\":\"forward|backward|left|right|stop\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"one sentence\",\"abort\":null}}\n\nRULES:\n- actions is a list of movement steps, max duration 5.0s each\n- move: \"forward\" \"backward\" \"left\" \"right\" \"stop\"\n- arm: \"wave\" \"raise_right\" \"raise_left\" \"clap\" \"high_five\" \"hug\" \"heart\" \"shake_hand\" \"face_wave\" or null\n- arm is NEVER a move value\n- questions/descriptions: actions=[]\n- obstacle < 0.5m: abort = \"obstacle detected\"\n- \"90 degrees\" = 5.0s | \"45 degrees\" = 2.5s | \"1 step\" = 1.0s\n\nEXAMPLES:\n\"turn right\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Turning right\",\"abort\":null}}\n\"turn right 90 degrees\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":5.0}}],\"arm\":null,\"speak\":\"Turning 90 degrees\",\"abort\":null}}\n\"move back then left\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}},{{\"move\":\"left\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving back then left\",\"abort\":null}}\n\"wave\" -> {{\"actions\":[],\"arm\":\"wave\",\"speak\":\"Waving\",\"abort\":null}}\n\"raise your right arm\" -> {{\"actions\":[],\"arm\":\"raise_right\",\"speak\":\"Raising right arm\",\"abort\":null}}\n\"walk forward and wave\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":\"wave\",\"speak\":\"Walking and waving\",\"abort\":null}}\n\"what do you see\" -> {{\"actions\":[],\"arm\":null,\"speak\":\"I see...\",\"abort\":null}}\n\"stop\" -> {{\"actions\":[{{\"move\":\"stop\",\"duration\":0}}],\"arm\":null,\"speak\":\"Stopping\",\"abort\":null}}\n\"come to me\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming to you\",\"abort\":null}}\n\"come back\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming back\",\"abort\":null}}\n\"come here\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming\",\"abort\":null}}\n\"get closer\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":1.0}}],\"arm\":null,\"speak\":\"Moving closer\",\"abort\":null}}\n\"go away\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving away\",\"abort\":null}}\n\nCommand: \"{command}\"\nJSON:", "main_prompt": "You are Sanad, a humanoid robot. 
Look at the image and follow the command.\n{facts}\n\nCommand: \"{command}\"\n\nReply with ONLY this JSON — no markdown, no explanation:\n{{\"actions\":[{{\"move\":\"forward|backward|left|right|stop\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"one sentence\",\"abort\":null}}\n\nRULES:\n- actions is a list of movement steps, max duration 5.0s each\n- move: \"forward\" \"backward\" \"left\" \"right\" \"stop\"\n- arm: \"wave\" \"raise_right\" \"raise_left\" \"clap\" \"high_five\" \"hug\" \"heart\" \"shake_hand\" \"face_wave\" or null\n- arm is NEVER a move value\n- questions/descriptions: actions=[]\n- obstacle < 0.5m: abort = \"obstacle detected\"\n- \"90 degrees\" = 5.0s | \"45 degrees\" = 2.5s | \"1 step\" = 1.0s\n\nEXAMPLES:\n\"turn right\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Turning right\",\"abort\":null}}\n\"turn right 90 degrees\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":5.0}}],\"arm\":null,\"speak\":\"Turning 90 degrees\",\"abort\":null}}\n\"move back then left\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}},{{\"move\":\"left\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving back then left\",\"abort\":null}}\n\"wave\" -> {{\"actions\":[],\"arm\":\"wave\",\"speak\":\"Waving\",\"abort\":null}}\n\"raise your right arm\" -> {{\"actions\":[],\"arm\":\"raise_right\",\"speak\":\"Raising right arm\",\"abort\":null}}\n\"walk forward and wave\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":\"wave\",\"speak\":\"Walking and waving\",\"abort\":null}}\n\"what do you see\" -> {{\"actions\":[],\"arm\":null,\"speak\":\"I see...\",\"abort\":null}}\n\"stop\" -> {{\"actions\":[{{\"move\":\"stop\",\"duration\":0}}],\"arm\":null,\"speak\":\"Stopping\",\"abort\":null}}\n\"come to me\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming to you\",\"abort\":null}}\n\"come back\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming back\",\"abort\":null}}\n\"come here\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming\",\"abort\":null}}\n\"get closer\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":1.0}}],\"arm\":null,\"speak\":\"Moving closer\",\"abort\":null}}\n\"go away\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving away\",\"abort\":null}}\n\nCommand: \"{command}\"\nJSON:",
"goal_prompt": "You are Marcus navigating toward a goal.\n\nGOAL: \"{goal}\"\n\nLook at the image. Have you reached the goal?\n\nReply ONLY this JSON:\n{{\"reached\":false,\"next_move\":\"left\",\"duration\":0.5,\"speak\":\"what you see\"}}\n\nRULES:\n- reached: true ONLY if you clearly see the goal target right now\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.3 to 0.8 seconds\n- Default next_move: \"left\" to keep scanning\n\nGOAL: \"{goal}\"\nJSON:", "goal_prompt": "You are Sanad navigating toward a goal.\n\nGOAL: \"{goal}\"\n\nLook at the image. Have you reached the goal?\n\nReply ONLY this JSON:\n{{\"reached\":false,\"next_move\":\"left\",\"duration\":0.5,\"speak\":\"what you see\"}}\n\nRULES:\n- reached: true ONLY if you clearly see the goal target right now\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.3 to 0.8 seconds\n- Default next_move: \"left\" to keep scanning\n\nGOAL: \"{goal}\"\nJSON:",
"patrol_prompt": "You are Marcus, an HSE inspection robot on autonomous patrol.\n\nLook at the camera and assess the scene.\n\nReply ONLY this JSON:\n{{\"observation\":\"one sentence\",\"alert\":null,\"next_move\":\"forward\",\"duration\":1.0}}\n\nRULES:\n- alert = null if safe\n- alert = \"PPE: no helmet\" if person without helmet\n- alert = \"PPE: no vest\" if person without safety vest\n- alert = \"Hazard: description\" for other hazards\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.5 to 2.0s\n\nJSON:" "patrol_prompt": "You are Sanad, an HSE inspection robot on autonomous patrol.\n\nLook at the camera and assess the scene.\n\nReply ONLY this JSON:\n{{\"observation\":\"one sentence\",\"alert\":null,\"next_move\":\"forward\",\"duration\":1.0}}\n\nRULES:\n- alert = null if safe\n- alert = \"PPE: no helmet\" if person without helmet\n- alert = \"PPE: no vest\" if person without safety vest\n- alert = \"Hazard: description\" for other hazards\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.5 to 2.0s\n\nJSON:"
} }

View File

@ -1,53 +1,44 @@
{ {
"tts": { "tts": {
"piper_voice_ar": "ar_JO-kareem-medium", "backend": "builtin_ttsmaker",
"piper_voice_en": "en_US-lessac-medium",
"piper_sample_rate": 22050,
"builtin_speaker_id": 0, "builtin_speaker_id": 0,
"target_sample_rate": 16000, "target_sample_rate": 16000
"piper_timeout_sec": 120,
"en_backend": "edge_tts",
"ar_backend": "edge_tts",
"edge_voice_ar": "ar-AE-HamdanNeural",
"edge_voice_en": "en-US-GuyNeural"
}, },
"stt": { "stt": {
"wake_model": "tiny", "wake_model": "tiny",
"command_model": "small", "command_model": "small",
"wake_words_en": [ "wake_words_en": ["sanad", "sannad", "sanat", "sunnat"],
"marcus", "language": "en",
"marcos",
"markus"
],
"wake_words_ar": [
"ماركوس",
"مارکوس",
"ماركس"
],
"language": null,
"command_timeout_sec": 10, "command_timeout_sec": 10,
"silence_threshold": 500, "silence_threshold": 500,
"silence_duration_sec": 1.5, "silence_duration_sec": 1.5,
"max_record_sec": 15 "max_record_sec": 15
}, },
"mic": { "mic": {
"backend": "builtin_udp",
"source_index": "3", "source_index": "3",
"format": "s16le", "format": "s16le",
"rate": 16000, "rate": 16000,
"channels": 1 "channels": 1
}, },
"mic_udp": {
"group": "239.168.123.161",
"port": 5555,
"buffer_max_bytes": 64000,
"read_timeout_sec": 0.04
},
"speaker": { "speaker": {
"dds_interface": "eth0", "dds_interface": "eth0",
"volume": 100, "volume": 100,
"app_name": "marcus" "app_name": "sanad"
}, },
"audio": { "audio": {
"data_dir": "Data/Voice/Recordings", "data_dir": "Data/Voice/Recordings",
"log_file": "logs/voice.log" "log_file": "logs/voice.log"
}, },
"messages": { "messages": {
"wake_heard": "Listening...", "wake_heard": "Listening",
"no_speech": "I didn't catch that. Say my name again.", "no_speech": "I didn't catch that, please say it again",
"error_tts": "Speech synthesis failed", "error_tts": "Speech synthesis failed",
"error_mic": "Microphone error", "error_mic": "Microphone error",
"ready": "Voice system ready" "ready": "Voice system ready"

View File

@ -26,7 +26,7 @@
# ══════════════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════════════
main_prompt: | main_prompt: |
You are Marcus — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai. You are Sanad — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai.
You are physically present in the room. You have a body, arms, and a camera. You are physically present in the room. You have a body, arms, and a camera.
You follow commands from your operator and respond intelligently. You follow commands from your operator and respond intelligently.
{facts} {facts}
@ -184,7 +184,7 @@ main_prompt: |
→ {{"actions":[],"arm":null,"speak":"Nice to meet you Kassam, I will remember that","abort":null}} → {{"actions":[],"arm":null,"speak":"Nice to meet you Kassam, I will remember that","abort":null}}
"who are you" "who are you"
→ {{"actions":[],"arm":null,"speak":"I am Marcus, a humanoid robot assistant by YS Lootah Technology","abort":null}} → {{"actions":[],"arm":null,"speak":"I am Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}
Safety: Safety:
"walk into the wall" "walk into the wall"
@ -307,7 +307,7 @@ main_prompt: |
# ══════════════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════════════
goal_prompt: | goal_prompt: |
You are Marcus, a humanoid robot actively navigating toward a specific target. You are Sanad, a humanoid robot actively navigating toward a specific target.
YOUR MISSION: "{goal}" YOUR MISSION: "{goal}"
@ -392,7 +392,7 @@ goal_prompt: |
# ══════════════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════════════
patrol_prompt: | patrol_prompt: |
You are Marcus, a humanoid robot autonomously exploring and mapping an office environment. You are Sanad, a humanoid robot autonomously exploring and mapping an office environment.
Your mission: move through the space intelligently, identify areas and objects, Your mission: move through the space intelligently, identify areas and objects,
and build a spatial understanding of the layout. and build a spatial understanding of the layout.
@ -463,7 +463,7 @@ patrol_prompt: |
# ══════════════════════════════════════════════════════════════════════════════ # ══════════════════════════════════════════════════════════════════════════════
talk_prompt: | talk_prompt: |
You are Marcus, a humanoid robot assistant. You have been asked a question You are Sanad, a humanoid robot assistant. You have been asked a question
or given information. Do NOT move — just respond intelligently. or given information. Do NOT move — just respond intelligently.
{facts} {facts}
@ -509,7 +509,7 @@ talk_prompt: |
→ {{"actions":[],"arm":null,"speak":"All systems are functioning normally and I am ready to assist you","abort":null}} → {{"actions":[],"arm":null,"speak":"All systems are functioning normally and I am ready to assist you","abort":null}}
"what is your name" "what is your name"
→ {{"actions":[],"arm":null,"speak":"My name is Marcus, a humanoid robot assistant by YS Lootah Technology","abort":null}} → {{"actions":[],"arm":null,"speak":"My name is Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}
"who built you" "who built you"
→ {{"actions":[],"arm":null,"speak":"I was built by YS Lootah Technology in Dubai, and my programmer is Kassam","abort":null}} → {{"actions":[],"arm":null,"speak":"I was built by YS Lootah Technology in Dubai, and my programmer is Kassam","abort":null}}

View File

@ -1,9 +1,13 @@
""" """
logger.py Project-wide logging via Logger.py logger.py Project-wide configured logging instance.
Imports the `Logs` backend class from log_backend.py (formerly Logger.py;
renamed to avoid a case-only filename collision with this module, which
breaks checkouts on any case-insensitive filesystem: macOS's default APFS/HFS+, Windows).
""" """
import os import os
from Core.env_loader import PROJECT_ROOT from Core.env_loader import PROJECT_ROOT
from Core.Logger import Logs from Core.log_backend import Logs
# Single shared instance — all modules use this # Single shared instance — all modules use this
_logs = Logs(main_log_file=os.path.join(PROJECT_ROOT, "logs", "main.log")) _logs = Logs(main_log_file=os.path.join(PROJECT_ROOT, "logs", "main.log"))
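Call sites are unaffected by the rename; for example (a usage sketch matching the signature `API/zmq_api.py` already uses):

```python
from Core.logger import log

log("ZMQ PUB bound on tcp://127.0.0.1:5556", "info", "zmq")  # message, level, module tag
```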

View File

@ -1,8 +1,37 @@
# Marcus — Full API & Developer Reference # Marcus — Full API & Developer Reference
**Project:** Marcus | YS Lootah Technology | Jetson Orin NX + G1 EDU **Project:** Marcus | YS Lootah Technology | Jetson Orin NX + G1 EDU
**Scripts:** `~/Models_marcus/marcus_llava.py` + `~/Models_marcus/marcus_yolo.py` **Robot persona:** Sanad (wake word + self-intro; project code stays under `Marcus/`)
**Updated:** April 4, 2026 **Entry points:** `run_marcus.py` (terminal) / `Server/marcus_server.py` (WebSocket)
**Updated:** 2026-04-21
> **What changed since the early draft (April 4):** The project was restructured
> from two monolithic scripts (`marcus_llava.py` + `marcus_yolo.py`) into a
> layered architecture. See `Doc/architecture.md` for the current file tree and
> `Doc/environment.md` for the verified Jetson software stack, exact library
> versions, and GPU bring-up recipe. This reference still describes the
> function-level semantics (inputs/outputs/examples) — treat any file path in
> this document as illustrative and cross-check the actual module. Recent
> deltas are called out inline below.
### Recent API deltas (2026-04-21)
| Change | Location | Note |
|---|---|---|
| GPU is mandatory for YOLO | `Config/config_Vision.json`, `Vision/marcus_yolo.py` | `yolo_device` defaults to `"cuda"` and is enforced; `_resolve_device()` raises `RuntimeError` on missing CUDA. `yolo_half=true` runs FP16 on Orin (capability 8.7). |
| Ollama model | `Config/config_Brain.json` | Default `ollama_model` is `qwen2.5vl:3b` (not `llava:7b`). |
| Ollama compute-graph caps | `Config/config_Brain.json` | `num_batch=128`, `num_ctx=2048` — required on 16 GB Orin NX to prevent the llama runner OOM. Propagated by `API/llava_api.py` and `Vision/marcus_imgsearch.py` to every `ollama.chat` call. |
| `num_predict_main` lowered | `Config/config_Brain.json` | 200 → 120 (shaves ~400-600 ms per open-ended command; JSON still parses). |
| ZMQ bind moved out of import | `API/zmq_api.py` | `init_zmq()` must be called from the main process before any `send_vel/send_cmd`. `init_brain()` does this. Children spawned via `multiprocessing` no longer collide on port 5556. |
| Camera-retry poll | `Brain/marcus_brain.py::_handle_llava` | Replaced `time.sleep(1.0)` with 10×50 ms polls. |
| Conditional scan sleeps | `Navigation/goal_nav.py`, `Autonomous/marcus_autonomous.py` | Removed unconditional per-step naps when real work (YOLO hit, LLaVA call, forward move) already consumed wall time. |
| Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. |
| Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. |
| Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. |
| Gemini voice deleted | `Voice/marcus_gemini_voice.py` removed | `_init_voice()` now spawns `Voice.marcus_voice.VoiceModule` (Whisper wake + command STT). No more WebSocket, no more asyncio event loop, no API key. |
| Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. |
| Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. |
| Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. |
--- ---
@ -22,38 +51,54 @@
12. [JSON Schema Reference](#12-json-schema-reference) 12. [JSON Schema Reference](#12-json-schema-reference)
13. [Environment & Paths](#13-environment--paths) 13. [Environment & Paths](#13-environment--paths)
14. [Quick Reference Card](#14-quick-reference-card) 14. [Quick Reference Card](#14-quick-reference-card)
15. [Voice API (mic + TTS + STT)](#15-voice-api-mic--tts--stt)
--- ---
## 1. Configuration Variables ## 1. Configuration Variables
Defined at the top of `marcus_llava.py`. Edit here to change global behavior. All configuration is now **JSON-driven** and lives under `Config/`. Each module
loads its config at startup via `Core.config_loader.load_config(name)`.
| Variable | Default | Description | **`Config/config_ZMQ.json`** (Holosoma bridge)
|----------|---------|-------------|
| `ZMQ_HOST` | `"127.0.0.1"` | Holosoma ZMQ host |
| `ZMQ_PORT` | `5556` | Holosoma ZMQ port |
| `ZMQ_YOLO_PORT` | `5557` | YOLO ZMQ port (standalone mode) |
| `OLLAMA_MODEL` | `"llava:7b"` | LLaVA model via Ollama |
| `CAM_WIDTH` | `424` | Camera capture width (px) |
| `CAM_HEIGHT` | `240` | Camera capture height (px) |
| `CAM_FPS` | `15` | Camera frame rate |
| `CAM_QUALITY` | `70` | JPEG quality sent to LLaVA |
| `STOP_ITERATIONS` | `20` | gradual_stop message count |
| `STOP_DELAY` | `0.05` | seconds between stop messages |
| `STEP_PAUSE` | `0.3` | pause between consecutive action steps |
| `ARM_SDK_PATH` | `/home/unitree/unitree_sdk2_python` | Arm SDK path |
| `ARM_INTERFACE` | `"eth0"` | Network interface for arm SDK |
Defined at top of `marcus_yolo.py`: | Key | Default | Description |
|---|---|---|
| `zmq_host` | `"127.0.0.1"` | Holosoma ZMQ host |
| `zmq_port` | `5556` | Holosoma ZMQ port |
| `stop_iterations` | `20` | `gradual_stop()` message count |
| `stop_delay` | `0.05` | seconds between stop messages |
| `step_pause` | `0.3` | pause between consecutive action steps |
| Variable | Default | Description | **`Config/config_Brain.json`** (Ollama VL model)
|----------|---------|-------------|
| `YOLO_MODEL_PATH` | `.../Model/yolov8m.pt` | YOLO model path | | Key | Default | Description |
| `YOLO_CONFIDENCE` | `0.45` | Minimum detection confidence | |---|---|---|
| `YOLO_IOU` | `0.45` | NMS IOU threshold | | `ollama_model` | `"qwen2.5vl:3b"` | Ollama model tag |
| `YOLO_DEVICE` | `"cpu"` | Inference device ("cpu" or "cuda") | | `max_history` | `6` | conversation turns retained |
| `YOLO_IMG_SIZE` | `320` | Inference image size (smaller = faster) | | `num_batch` | `128` | llama.cpp batch — **cap, required for Jetson** |
| `num_ctx` | `2048` | llama.cpp KV context length — **cap, required for Jetson** |
| `num_predict_main` | `120` | max tokens for the main command path |
| `num_predict_goal` | `80` | goal-navigation call |
| `num_predict_patrol` | `100` | autonomous patrol call |
| `num_predict_talk` | `80` | talk-only path |
| `num_predict_verify` | `10` | YOLO condition verifier (`yes`/`no`) |
**`Config/config_Vision.json`** (YOLO)
| Key | Default | Description |
|---|---|---|
| `yolo_model_path` | `"Models/yolov8m.pt"` | weights file (auto-fetched if missing) |
| `yolo_confidence` | `0.45` | detection confidence threshold |
| `yolo_iou` | `0.45` | NMS IOU threshold |
| `yolo_device` | `"cuda"` | **GPU required**; `"cpu"` raises `RuntimeError` |
| `yolo_half` | `true` | FP16 inference (Ampere tensor cores) |
| `yolo_img_size` | `320` | inference image size |
| `tracked_classes` | 19 COCO classes | filter for relevant detections |
**`Config/config_Camera.json`**: `424x240 @ 15 fps`, `JPEG quality 70`.
**`Config/config_Voice.json`**: see section 6 below.
**`Config/config_Network.json`**: Jetson eth0/wlan0 IPs, WebSocket port.
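Every module reads its file through the same helper; a usage sketch of the `load_config(name)` contract described above:

```python
from Core.config_loader import load_config

cfg = load_config("Brain")                # reads Config/config_Brain.json
model = cfg["ollama_model"]               # "qwen2.5vl:3b"
flags = cfg.get("subsystems", {})         # {"lidar": true, "voice": true, ...}
```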
--- ---
@ -61,20 +106,28 @@ Defined at top of `marcus_yolo.py`:
### Setup ### Setup
The bind is no longer an import-time side effect. It runs inside `init_zmq()`, called once by `init_brain()` from the main process. Children (e.g. the LiDAR SLAM worker spawned via `multiprocessing.spawn`) can re-import `API.zmq_api` without rebinding.
```python ```python
ctx = zmq.Context() # API/zmq_api.py — bind happens here, not at module import
sock = ctx.socket(zmq.PUB) def init_zmq() -> zmq.Socket:
sock.bind("tcp://127.0.0.1:5556") global ctx, sock
time.sleep(0.5) if sock is not None:
return sock # idempotent
ctx = zmq.Context()
sock = ctx.socket(zmq.PUB)
sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}")
time.sleep(0.5) # let SUBs attach
return sock
``` ```
### `send_vel(vx, vy, vyaw)` ### `send_vel(vx, vy, vyaw)`
Send velocity command to Holosoma. Send velocity command to Holosoma. Raises `RuntimeError` if `init_zmq()` wasn't called.
```python ```python
def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0): def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0):
sock.send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}})) _ensure_sock().send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))
``` ```
| Parameter | Unit | Safe range | Effect | | Parameter | Unit | Safe range | Effect |
@ -661,14 +714,17 @@ from unitree_sdk2py.g1.arm.g1_arm_action_client import G1ArmActionClient # Arm
``` ```
STARTUP: STARTUP:
Tab 1: source ~/.holosoma_deps/miniconda3/bin/activate hsinference
       python3 run_policy.py inference:g1-29dof-loco \
         --task.velocity-input zmq --task.state-input zmq --task.interface eth0
Tab 2: ollama serve &
       /home/unitree/miniconda3/envs/marcus/bin/python3 ~/Models_marcus/marcus_llava.py
       (YOLO starts automatically — no Tab 3 needed)
Tab 1 (hsinference env): Holosoma locomotion policy
       cd ~/holosoma && sudo jetson_clocks
       python3 run_policy.py inference:g1-29dof-loco \
         --task.velocity-input zmq --task.state-input zmq --task.interface eth0
Tab 2: ollama serve > /tmp/ollama.log 2>&1 &
       sleep 3
Tab 3 (marcus env): conda activate marcus && cd ~/Marcus && python3 run_marcus.py
       (YOLO + voice + LiDAR all start automatically per subsystems flags)
WAKE WORD: "Sanad"
COMMANDS: COMMANDS:
walk forward · turn right · turn left · move back walk forward · turn right · turn left · move back
@ -704,4 +760,74 @@ SAFETY:
--- ---
## 15. Voice API (mic + TTS + STT)
New pipeline as of 2026-04-21. Replaces the Gemini live WebSocket + edge-tts/Piper stack.
### Mic — `Voice.builtin_mic.BuiltinMic`
Captures the G1's on-board array microphone over UDP multicast. No USB mic required. 16 kHz mono int16 PCM natively; no resampling needed.
```python
from Voice.builtin_mic import BuiltinMic
mic = BuiltinMic(group="239.168.123.161", port=5555, buf_max=64_000)
mic.start()
try:
pcm = mic.read_chunk(1024) # 512 samples, ~32 ms, int16 mono
# or
pcm = mic.read_seconds(3.0)
finally:
mic.stop()
```
Config under `config_Voice.json::mic_udp`.
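Underneath, joining the multicast group is standard socket code; a minimal sketch (an assumption about the internals: the real `BuiltinMic` wraps this in a receiver thread feeding a bounded buffer):

```python
import socket
import struct

def open_g1_mic(group: str = "239.168.123.161", port: int = 5555) -> socket.socket:
    """Join the G1 mic's UDP multicast group and return the bound socket."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sock.bind(("", port))
    # Standard IP_ADD_MEMBERSHIP recipe: group address + any local interface.
    mreq = struct.pack("4sl", socket.inet_aton(group), socket.INADDR_ANY)
    sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
    sock.settimeout(0.04)  # config_Voice.json::mic_udp.read_timeout_sec
    return sock
```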
### TTS — `Voice.builtin_tts.BuiltinTTS`
Wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker`. English only — refuses non-ASCII input.
```python
from Voice.builtin_tts import BuiltinTTS
tts = BuiltinTTS(audio_client, default_speaker_id=0)
tts.speak("Hello, I am Sanad", block=True) # synth + play on G1 body speaker
```
Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly.
### Wake + command loop — `Voice.marcus_voice.VoiceModule`
Four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` detects the wake word, `small` transcribes commands.
```python
from API.audio_api import AudioAPI
from Voice.marcus_voice import VoiceModule
def on_command(text, lang):
print(f"heard: {text}")
audio = AudioAPI()
voice = VoiceModule(audio, on_command=on_command)
voice.start() # background thread
# ... later ...
voice.stop()
```
Wake words are configured in `config_Voice.json::stt.wake_words_en`. The brain's `_init_voice()` wires `on_command` to `process_command(text)` + `audio_api.speak(reply)`.
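One pass through the loop might look like this (a condensed, hypothetical sketch: the real module adds silence detection, the command timeout, and a background thread; `transcribe()` in openai-whisper accepts a 16 kHz float32 waveform):

```python
import whisper  # openai-whisper: "tiny" for wake, "small" for commands

wake_model = whisper.load_model("tiny")
cmd_model = whisper.load_model("small")
WAKE_WORDS = {"sanad", "sannad", "sanat", "sunnat"}

def listen_once(audio_api, on_command):
    if audio_api.is_speaking:                     # never transcribe our own TTS
        return
    pcm = audio_api.record(seconds=2.0)           # int16 mono 16 kHz
    heard = wake_model.transcribe(pcm.astype("float32") / 32768.0,
                                  language="en")["text"].lower()
    if not any(w in heard for w in WAKE_WORDS):   # IDLE: no wake word heard
        return
    pcm = audio_api.record(seconds=5.0)           # WAKE_HEARD: capture command
    text = cmd_model.transcribe(pcm.astype("float32") / 32768.0,
                                language="en")["text"].strip()
    if text:
        on_command(text, "en")                    # PROCESSING: hand to the brain
```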
### AudioAPI — `API.audio_api.AudioAPI`
Orchestration layer. Owns the `AudioClient`, manages mute/unmute, exposes a clean `speak` + `record` API.
```python
from API.audio_api import AudioAPI
audio = AudioAPI()
audio.speak("Hello") # English only; non-ASCII returns early
pcm = audio.record(seconds=5) # int16 mono 16 kHz — uses BuiltinMic
audio.play_pcm(pcm) # raw PCM playback via Unitree RPC
```
Config: `config_Voice.json::tts.backend = "builtin_ttsmaker"`, `mic.backend = "builtin_udp"` (or `"pactl_parec"` to fall back to Hollyland).
---
*Marcus — YS Lootah Technology | Kassam | April 2026* *Marcus — YS Lootah Technology | Kassam | April 2026*

Binary file not shown.

Binary file not shown.

View File

@ -1,20 +1,39 @@
# Marcus — System Architecture # Marcus — System Architecture
**Project**: Marcus | YS Lootah Technology **Project**: Marcus | YS Lootah Technology
**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX (16GB) **Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX 16 GB
**Updated**: 2026-04-06 **Robot persona**: **Sanad** (wake word + self-intro; project code still lives under `Marcus/`)
**Updated**: 2026-04-21
---
## Recent deltas (since 2026-04-06)
- **GPU-only YOLO** — `_resolve_device()` raises `RuntimeError` if CUDA is missing. `yolo_device=cuda`, `yolo_half=true` by default.
- **Ollama compute-graph caps** — `num_batch=128`, `num_ctx=2048` in `config_Brain.json` (otherwise llama.cpp OOMs on the 16 GB Jetson).
- **`num_predict_main: 120`** (was 200) — saves ~400-600 ms per open-ended command.
- **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import.
- **G1 built-in microphone**: captured over UDP multicast `239.168.123.161:5555`; `Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic.
- **G1 built-in TTS**: `client.TtsMaker()` wrapped by `Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed.
- **Gemini voice module deleted** — Whisper wake-word + command STT path is now authoritative (`Voice/marcus_voice.py`).
- **Subsystem flags**: `config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages.
- **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps.
- **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo clones cleanly on macOS/Windows.
- **Robot persona = "Sanad"** — wake words, prompts, banner, and self-intro all use "Sanad". Project identity ("Marcus") remains in file names, class names, directory, logs.
See `Doc/environment.md` for the verified Jetson software stack and `Doc/pipeline.md` for the end-to-end data flow.
---
## Overview
Marcus is a mostly-offline humanoid robot AI system. The brain runs on Jetson Orin NX using a local vision-language model (Qwen2.5-VL via Ollama) for open-ended commands, YOLOv8m for real-time object detection (CUDA + FP16), dead reckoning + optional ROS2 odometry for pose, Livox Mid-360 LiDAR + a custom SLAM worker for mapping, and persistent memory across sessions.
Two operating modes:
- **Terminal mode** (`run_marcus.py`) — direct keyboard control on the Jetson. Voice subsystem runs alongside by default.
- **Server mode** (`Server/marcus_server.py`) — WebSocket server allowing remote CLI or GUI clients.
Both modes use the **same brain** — identical command processing, same YOLO, same memory, same movement control. Voice, LiDAR, image-search and autonomous-patrol are gated behind `config_Brain.json::subsystems` flags.
---
@ -28,14 +47,14 @@ Marcus/
├── Core/                       # Foundation layer — no external deps
│   ├── env_loader.py           # Reads .env, resolves PROJECT_ROOT
│   ├── config_loader.py        # load_config(name) → reads Config/config_{name}.json
│   ├── log_backend.py          # Logging engine (file-based, no console output) — was Logger.py
│   └── logger.py               # Project wrapper: log(), log_and_print(), get_logger()
├── Config/                     # ALL configuration — one JSON per module
│   ├── config_ZMQ.json         # ZMQ host, port, stop params
│   ├── config_Camera.json      # RealSense resolution, fps, quality
│   ├── config_Brain.json       # Ollama model, prompts, num_predict, num_batch/ctx, subsystems
│   ├── config_Vision.json      # YOLO model path, device=cuda, half=true, confidence, tracked classes
│   ├── config_Navigation.json  # move_map, goal aliases, YOLO goal classes
│   ├── config_Patrol.json      # patrol duration, proximity threshold
│   ├── config_Arm.json         # arm actions, aliases, availability flag
@ -43,17 +62,26 @@ Marcus/
│   ├── config_Memory.json      # session/places paths
│   ├── config_Network.json     # Jetson IPs (eth0/wlan0), ports
│   ├── config_ImageSearch.json # search defaults
│   ├── config_Voice.json       # mic (builtin_udp|pactl_parec), TTS backend, wake words, mic_udp group/port
│   ├── config_LiDAR.json       # Livox Mid-360 connection + SLAM engine params
│   └── marcus_prompts.yaml     # All Qwen-VL prompts (main, goal, patrol, talk, verify)
├── API/                        # Interface layer — one file per subsystem
│   ├── zmq_api.py              # ZMQ PUB socket: init_zmq(), send_vel(), gradual_stop(), send_cmd()
│   ├── camera_api.py           # RealSense thread: start/stop_camera(), get_frame()
│   ├── llava_api.py            # Qwen2.5-VL queries via Ollama: call_llava(), ask(), ask_goal()…
│   ├── yolo_api.py             # YOLO interface: init_yolo(), yolo_sees(), yolo_summary()
│   ├── odometry_api.py         # Odometry wrapper: init_odometry(), get_position()
│   ├── memory_api.py           # Memory wrapper: init_memory(), log_cmd(), place_save/goto()
│   ├── arm_api.py              # Arm gestures: do_arm(), ARM_ACTIONS, ALL_ARM_NAMES (stub)
│   ├── imgsearch_api.py        # Image search wrapper: init_imgsearch(), get_searcher()
│   ├── audio_api.py            # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic
│   └── lidar_api.py            # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status()
├── Voice/                      # Mic + TTS + wake-word STT
│   ├── builtin_mic.py          # G1 array mic via UDP multicast 239.168.123.161:5555
│   ├── builtin_tts.py          # BuiltinTTS — client.TtsMaker(text, speaker_id)
│   └── marcus_voice.py         # VoiceModule — Whisper tiny (wake) + small (command) state machine
├── Brain/                      # Decision logic — imports ONLY from API/
│   ├── marcus_brain.py         # Orchestrator: init_brain(), process_command(), run_terminal()
@ -127,39 +155,40 @@ Marcus/
│  Server/marcus_server.py (WebSocket)                │
└──────────────────┬───────────────────────────────────┘
┌──────────────────▼───────────────────────────────────┐
│                    Brain Layer                        │
│  marcus_brain.py   — init_brain() / process_command   │
│  command_parser.py — regex-table local commands       │
│  executor.py       — execute Qwen-VL decisions        │
│  marcus_memory.py  — session + place memory           │
└──────────────────┬───────────────────────────────────┘
                   │ imports only from API/
┌──────────────────▼───────────────────────────────────┐
│                     API Layer                         │
│  zmq_api    camera_api    llava_api    audio_api      │
│  yolo_api   odometry_api  memory_api   imgsearch_api  │
│  arm_api    lidar_api                                 │
└──────────────┬───────────────────────┬────────────────┘
               │ wraps                 │ wraps
┌──────────────▼───────────┐ ┌─────────▼───────────────┐
│   Navigation / Vision    │ │         Voice           │
│  goal_nav.py             │ │  builtin_mic.py         │
│  patrol.py               │ │  builtin_tts.py         │
│  marcus_odometry.py      │ │  marcus_voice.py        │
│  marcus_yolo.py          │ │  (Whisper + TtsMaker)   │
│  marcus_imgsearch.py     │ └─────────┬───────────────┘
└──────────────┬───────────┘           │
               │                       │
┌──────────────▼───────────────────────▼────────────────┐
│                     Core Layer                        │
│  env_loader.py    config_loader.py                    │
│  log_backend.py   logger.py                           │
└──────────────────┬────────────────────────────────────┘
                   │ reads
┌──────────────────▼────────────────────────────────────┐
│                  Config / .env                        │
│  13 JSON files + marcus_prompts.yaml                  │
└───────────────────────────────────────────────────────┘
```
**Rule**: Brain never imports from Vision/ or Navigation/ directly. It goes through the API layer.
@ -176,11 +205,11 @@ Reads `.env` from the project root to resolve `PROJECT_ROOT`. Uses a minimal bui
#### `config_loader.py` (30 lines)
`load_config(name)` reads `Config/config_{name}.json` and caches the result. All modules call this instead of hardcoding constants. Also provides `config_path(relative)` to resolve relative paths (e.g., `"Models/yolov8m.pt"`) to absolute paths from PROJECT_ROOT.
#### `log_backend.py` (186 lines, was `Logger.py`)
Full logging engine ported from AI_Photographer. File-based only (no console output by default). Creates per-module log files in `logs/`. Handles write permission fallbacks, log name normalization, and corrupt log recovery. Renamed from `Logger.py` on 2026-04-21 to eliminate a case-only collision with `logger.py` that prevented the repo from cloning on case-insensitive filesystems (macOS/Windows).
#### `logger.py` (51 lines)
Project wrapper around `log_backend.Logs`. Provides:
- `log(message, level, module)` — write to `logs/{module}.log`
- `log_and_print(message, level, module)` — write + print
- `get_logger(module)` — get configured Logs instance
@ -191,12 +220,15 @@ Project wrapper around `Logger.py`. Provides:
Each API file wraps one subsystem. They read their own config via `load_config()`, handle import errors gracefully with fallback stubs, and export clean public functions.
#### `zmq_api.py` (~75 lines)
Holds the ZMQ PUB socket used to drive Holosoma at 50 Hz. **The bind is not a module import side effect any more** — it runs only when `init_zmq()` is called from the main (parent) process. This lets the LiDAR SLAM worker (spawned via `multiprocessing.spawn`) re-import the module without rebinding port 5556 and crashing.
**Exports:**
- `init_zmq()` — idempotent bind, called once by `init_brain()`
- `send_vel(vx, vy, vyaw)` — send velocity to Holosoma
- `gradual_stop()` — 20 zero-velocity messages over 1 second
- `send_cmd(cmd)` — Holosoma state machine (`start` / `walk` / `stand` / `stop`)
- `get_socket()` — access the bound socket (for odometry to reuse)
- `MOVE_MAP` — direction-to-velocity lookup: `{"forward": (0.3, 0, 0), "left": (0, 0, 0.3), ...}`
@ -440,6 +472,37 @@ Supports text-only search (no reference image) using hint description.
---
### Voice/
Mic, TTS and wake-word pipeline. All three files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable).
#### `builtin_mic.py` (~180 lines, new 2026-04-21)
Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer.
**Exports:**
- `BuiltinMic(group, port, buf_max, read_timeout)` — init (idempotent)
- `start()` / `stop()` — socket lifecycle
- `read_chunk(n)` — pull exactly `n` bytes (blocks up to `read_timeout`, pads silence otherwise)
- `read_seconds(s)` — convenience for "record `s` seconds"
- `flush()` — drop buffered audio (called while TTS plays, to avoid echo)
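The `flush()` bullet is the echo-guard pattern; a rough sketch of the intended usage (assumed; the production loop lives in `Voice/marcus_voice.py`):
```python
from Voice.builtin_mic import BuiltinMic
from Voice.builtin_tts import BuiltinTTS

def ask_and_listen(mic: BuiltinMic, tts: BuiltinTTS, prompt: str) -> bytes:
    mic.flush()                    # drop anything captured before the prompt
    tts.speak(prompt, block=True)  # blocks for the estimated playback duration
    mic.flush()                    # drop the robot's own speech the mic picked up
    return mic.read_seconds(4.0)   # now capture the human's reply
```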
#### `builtin_tts.py` (~70 lines, new 2026-04-21)
Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input (the G1 silently maps Arabic to Chinese, which confuses everyone).
**Exports:**
- `BuiltinTTS(audio_client, default_speaker_id=0)` — init
- `speak(text, speaker_id=None, block=True)` — synth+play on G1 body speaker
#### `marcus_voice.py` (~340 lines, rewired 2026-04-21)
Always-listening English voice loop with a four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` listens for the wake word "Sanad" on 2-second chunks; Whisper `small` transcribes the full command. Mic input comes from `BuiltinMic`; responses go through `audio_api.speak()``BuiltinTTS`.
**Exports:**
- `VoiceModule(audio_api, on_command=cb)` — init
- `start()` — spawn background thread
- `stop()` — graceful teardown
---
### Server/
#### `marcus_server.py` (224 lines)
@ -1,15 +1,16 @@
# Marcus — Control & Startup Guide
**Robot persona:** Sanad (wake word + self-intro; project code lives under `Marcus/`)
**Updated**: 2026-04-21
---
## Quick Start
### Prerequisites (Jetson Orin NX, JetPack 5.1.1)
```bash
# Terminal 1 — Start Holosoma (locomotion policy, in hsinference env)
source ~/.holosoma_deps/miniconda3/bin/activate hsinference
cd ~/holosoma
~/.holosoma_deps/miniconda3/envs/hsinference/bin/python3 \
@ -19,28 +20,46 @@ cd ~/holosoma
  --task.velocity-input zmq \
  --task.state-input zmq \
  --task.interface eth0
# Terminal 2 — Ollama server (leave running)
ollama serve > /tmp/ollama.log 2>&1 &
sleep 3
ollama list # confirm qwen2.5vl:3b present
```
### Option A — Terminal Mode (on Jetson)
```bash
# Terminal 3 — Start Marcus Brain
conda activate marcus
cd ~/Marcus
python3 run_marcus.py
```
Direct keyboard control + voice input (say **"Sanad"** to wake). Expected banner on boot:
```
================================================
SANAD AI BRAIN — READY
================================================
model : qwen2.5vl:3b
yolo : True
odometry : True
memory : True
lidar : True
voice : True
camera : 424x240@15
```
### Option B — Server + Client (remote)
```bash
# Terminal 3 (Jetson) — Start Server
conda activate marcus
cd ~/Marcus
python3 -m Server.marcus_server
# Terminal 4 (Workstation) — Connect Client
cd ~/Robotics_workspace/yslootahtech/Project/Marcus
python3 -m Client.marcus_cli
```
@ -58,6 +77,20 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
---
## Voice
- **Wake word:** "Sanad" (variants "sannad", "sanat", "sunnat" — see `config_Voice.json::stt.wake_words_en`)
- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed.
- **STT:** Whisper `tiny` (wake detection) + Whisper `small` (command transcription) — both run locally.
- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only.
- **Barge-in:** say something while Marcus is speaking and the mic buffer flushes on the next command.
Interaction flow: say "Sanad" → hear *"Listening"* → speak your command → see transcript on console → Marcus answers through the speaker.
To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only ~2 s faster.
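For reference, a plausible shape for the keys mentioned above (values are the ones documented in this guide; the real `Config/config_Voice.json` has more fields):
```jsonc
{
  "mic":     { "backend": "builtin_udp" },                        // or "pactl_parec" (Hollyland fallback)
  "mic_udp": { "group": "239.168.123.161", "port": 5555 },
  "tts":     { "backend": "builtin_ttsmaker" },
  "stt":     { "wake_words_en": ["sanad", "sannad", "sanat", "sunnat"] }
}
```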
---
## Command Reference
### Movement
@ -75,17 +108,17 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
### Vision
| Command | Action |
|---------|--------|
| `what do you see` | Qwen2.5-VL describes camera view |
| `describe the room` | Qwen2.5-VL scene description |
| `is anyone here` | Qwen2.5-VL person check |
| `yolo` | Show YOLO detection status |
### Goal Navigation
| Command | Action |
|---------|--------|
| `goal/ stop when you see a person` | YOLO fast search + stop |
| `goal/ find a laptop` | YOLO + Qwen-VL search |
| `goal/ stop when you see a guy holding a phone` | YOLO + Qwen-VL compound verification |
| `find a person` | Auto-detected as goal (no prefix needed) |
| `look for a bottle` | Auto-detected as goal |
@ -106,7 +139,7 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
| `patrol` | Autonomous patrol (prompts for duration) |
| `patrol: door → desk → exit` | Named waypoint patrol |
### Image Search (requires `subsystems.imgsearch: true`)
| Command | Action |
|---------|--------|
| `search/ /path/to/photo.jpg` | Find target from reference image |
@ -122,11 +155,20 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
| `last session` | Previous session summary |
| `session summary` | Current session stats |
### Autonomous Mode
| Command | Action |
|---------|--------|
| `auto on` | Start autonomous exploration |
| `auto off` | Stop |
| `auto status` | Current step / observations |
| `auto save` | Snapshot observations to disk |
### System
| Command | Action |
|---------|--------|
| `help` | Command reference |
| `example` | Usage examples |
| `lidar` / `lidar status` | SLAM engine pose + health |
| `q` / `quit` | Shutdown |
### Client-Only Commands (CLI)
@ -139,35 +181,43 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
---
## Subsystem flags (`Config/config_Brain.json`)
Control what initializes at boot. Defaults:
```jsonc
"subsystems": {
"lidar": true,
"voice": true,
"imgsearch": false,
"autonomous": true
}
```
Set any to `false` to skip that subsystem's init. Boot time drops roughly:
- `voice: false` → ~2 s faster (no Whisper model load)
- `lidar: false` → ~1 s faster (no SLAM subprocess spawn)
- `imgsearch: false` → already the default; re-enable only when you need `search/ …`
- `autonomous: false` → minor, but removes the AutonomousMode init
---
## Network Configuration
| Interface | IP | Use |
|-----------|-----|------|
| `eth0` | 192.168.123.164 | Robot internal network (Jetson ↔ G1 ↔ LiDAR) |
| `wlan0` | 10.255.254.86 | Office WiFi (Jetson ↔ Workstation) |
| Service | Port | Protocol |
|---------|------|----------|
| Marcus WebSocket | 8765 | ws:// |
| ZMQ velocity (→ Holosoma) | 5556 | tcp:// (PUB/SUB) |
| Ollama API | 11434 | HTTP (localhost only) |
| G1 audio multicast (mic) | 5555 | UDP multicast 239.168.123.161 |
| Livox Mid-360 (LiDAR) | 192.168.123.120 | UDP (Livox SDK) |
Most values configurable in `Config/config_Network.json` and `config_Voice.json::mic_udp`.
---
## Subsystem Status
On startup, the server/brain shows:
```
YOLO : active (19 tracked classes, CPU, yolov8m.pt)
Odometry : active (dead reckoning, +/-10cm)
Memory : active (session_016_2026-04-06)
Camera : 424x240@15 (RealSense D435I)
LiDAR : ALIVE (Livox Mid360 at 192.168.123.120)
Arms : pending (GR00T N1.5 not yet integrated)
```
--- ---
@ -175,13 +225,15 @@ Arms : pending (GR00T N1.5 not yet integrated)
| Issue | Cause | Fix |
|-------|-------|-----|
| Banner shows `SANAD AI BRAIN — READY` but nothing moves | Holosoma not running | Start Holosoma (Terminal 1) first |
| `RuntimeError: CUDA not available` on boot | Wrong torch build on Jetson | See `Doc/environment.md` section 9.2 — reinstall the NVIDIA Jetson torch wheel |
| `llama runner process has terminated: %!w(<nil>)` | Ollama compute graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` |
| Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only |
| `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10-15 s on first Qwen load; subsequent commands are fast |
| Wake word never fires | Whisper hearing something else | Check `logs/voice.log` — if it transcribes as "sunnat"/"sannat", add your variant to `config_Voice.json::stt.wake_words_en` |
| Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" |
| `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If persists, `ping 192.168.123.120` |
| Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up |
---
@ -191,6 +243,7 @@ Arms : pending (GR00T N1.5 not yet integrated)
|------|------|
| Brain code | `~/Marcus/Brain/` |
| Server | `~/Marcus/Server/marcus_server.py` |
| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,marcus_voice}.py` |
| Config | `~/Marcus/Config/` |
| Prompts | `~/Marcus/Config/marcus_prompts.yaml` |
| YOLO model | `~/Marcus/Models/yolov8m.pt` |
@ -199,3 +252,5 @@ Arms : pending (GR00T N1.5 not yet integrated)
| Logs | `~/Marcus/logs/` |
See `Doc/architecture.md` for full project structure and file-by-file documentation.
See `Doc/environment.md` for the verified Jetson software stack.
See `Doc/pipeline.md` for the end-to-end data flow.
@ -1,10 +1,11 @@
# Marcus — Environment & Version Reference
**Project**: Marcus | YS Lootah Technology
**Robot persona**: Sanad (wake word + self-intro; codebase stays under `Marcus/`)
**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX 16 GB
**Deployment host**: `unitree@192.168.123.164` (hostname `ubuntu`)
**Conda env**: `marcus`
**Captured**: 2026-04-12 (updated 2026-04-21)
This document is the canonical record of the verified GPU-accelerated software stack running on the Jetson Orin NX. It covers system software, Python environment, Marcus runtime dependencies, installation recipe, verification commands, and known quirks. Pair it with `architecture.md` (what the code does) and `controlling.md` (how to drive it).
@ -136,29 +137,23 @@ Captured from `importlib` on 2026-04-12, `marcus` env on the Jetson.
## 8. Marcus project modules — import status
All 25 project modules import cleanly from the `marcus` env at `/home/unitree/Marcus`:
```
OK  Core.config_loader    Core.env_loader
OK  Core.log_backend      Core.logger
OK  Voice.builtin_mic     Voice.builtin_tts     Voice.marcus_voice
OK  Vision.marcus_yolo    Vision.marcus_imgsearch
OK  API.llava_api         API.yolo_api          API.camera_api
OK  API.zmq_api           API.imgsearch_api     API.odometry_api
OK  API.memory_api        API.arm_api           API.audio_api
OK  Navigation.goal_nav   Navigation.patrol     Navigation.marcus_odometry
OK  Brain.marcus_brain    Brain.marcus_memory   Brain.command_parser
OK  Autonomous.marcus_autonomous
```
Notable removals: `Voice/marcus_gemini_voice.py` deleted on 2026-04-21. `Core/Logger.py` renamed to `Core/log_backend.py`.
---
## 9. Installation recipe (reproducing this environment)
@ -378,3 +373,7 @@ Config file (`Config/config_Vision.json`):
| 2026-04-12 | Initial environment.md — full stack captured, GPU bring-up verified end to end. Steady-state YOLOv8m FPS on Orin NX measured at 21.9. Ollama Qwen2.5-VL verified at 100% GPU. |
| 2026-04-12 | `Vision/marcus_yolo.py` rewired to load `config_Vision.json`, added `_resolve_device()` with hard-fail on missing CUDA (GPU-only policy). `API/yolo_api.py` updated to propagate `RuntimeError`. `Config/config_Vision.json` set `yolo_device=cuda`, `yolo_half=true`. |
| 2026-04-12 | Installed NVIDIA Jetson torch `2.1.0a0+41361538.nv23.06` (replacing CPU-only PyPI `2.4.1`) + built torchvision `0.16.1` from source against it. Verified `nms device = cuda:0`. |
| 2026-04-12 | Fixed llama.cpp compute-graph OOM on Jetson: added `num_batch=128` + `num_ctx=2048` caps in `Config/config_Brain.json`, propagated through `API/llava_api.py` and `Vision/marcus_imgsearch.py`. Qwen2.5-VL compute graph drops from ~7.5 GiB to ~1.8 GiB. |
| 2026-04-21 | **Restructure**: moved ZMQ bind out of `API/zmq_api.py` import time into `init_zmq()`; fixes LiDAR SLAM worker spawn crash. Added loud GPU-requirement banner in `API/yolo_api.py`. Dropped `num_predict_main` 200→120. Made inner-loop sleeps in goal_nav/autonomous/imgsearch conditional. Renamed `Core/Logger.py` → `Core/log_backend.py` (case-collision fix). Updated `Doc/MARCUS_API.md` to current state. |
| 2026-04-21 | **Voice restructure**: added `Voice/builtin_mic.py` (G1 array mic via UDP multicast `239.168.123.161:5555`) and `Voice/builtin_tts.py` (thin `AudioClient.TtsMaker` wrapper). Rewired `Voice/marcus_voice.py` to use BuiltinMic. Refactored `API/audio_api.py::speak()` to use BuiltinTTS — removed ~110 lines of edge-tts + pydub + Piper plumbing. Deleted `Voice/marcus_gemini_voice.py`. Added `subsystems.{lidar,voice,imgsearch,autonomous}` gates in `config_Brain.json`, checked in `init_brain()`. |
| 2026-04-21 | **Persona swap**: robot identifies as Sanad. Wake words `["sanad","sannad","sanat","sunnat"]`, `speaker.app_name="sanad"`, all Qwen prompts say "You are Sanad", banner reads `SANAD AI BRAIN — READY`, hardcoded self-intro says "I am Sanad". Project directory, class names, filenames, and `PROJECT_NAME=Marcus` env var unchanged. |
@ -38,15 +38,12 @@ rm ~/Robotics_workspace/yslootahtech/Project/Marcus_fine_tune/marcus-gguf/marcus
https://github.com/AnjieCheng/NaVILA
https://rchalyang.github.io/EgoVLA/
https://github.com/RchalYang/EgoVLA_Release
https://github.com/openvla/openvla
https://github.com/unitreerobotics/unifolm-vla
https://github.com/OpenDriveLab/WholebodyVLA
Doc/pipeline.md (new file, 187 lines)
@ -0,0 +1,187 @@
# Marcus — End-to-End Pipeline
**Robot persona:** Sanad (wake word + self-intro)
**Updated:** 2026-04-21
One map of every data path from sensor to motor, voice to speech. Cross-reference with `architecture.md` (what each file is) and `MARCUS_API.md` (function signatures).
---
## Boot sequence
`Brain/marcus_brain.py::init_brain()` — called once from `run_marcus.py` or `marcus_server.py`.
```
run_marcus.py
init_brain()
├─ init_zmq() PUB bind tcp://127.0.0.1:5556 → Holosoma
├─ start_camera() RealSense 424×240@15fps → shared _raw_frame
├─ init_yolo(raw_frame, raw_lock) YOLOv8m CUDA FP16, 19 classes — background thread
├─ init_odometry() ROS2 /dog_odom → dead reckoning fallback
├─ init_memory() loads Data/Brain/Sessions/session_NNN/
├─ if subsystems.lidar: init_lidar() multiprocessing spawn SLAM_worker
├─ if subsystems.imgsearch: init_imgsearch() (off by default)
├─ if subsystems.autonomous: AutonomousMode() patrol state machine
├─ send_cmd("start") + 0.5s + send_cmd("walk") + 0.5s Holosoma handshake
├─ if subsystems.voice: _init_voice() ▼ voice pipeline below
└─ _warmup_llava() first Qwen2.5-VL inference
"SANAD AI BRAIN — READY"
```
Subsystem flags live in `config_Brain.json::subsystems`. Current defaults:
```json
"subsystems": { "lidar": true, "voice": true, "imgsearch": false, "autonomous": true }
```
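A sketch of how these flags gate `init_brain()` stages (assumed shape, using names from the boot sequence above; `_init_voice` and `AutonomousMode` are internal to the brain):
```python
from Core.config_loader import load_config
from API.lidar_api import init_lidar
from API.imgsearch_api import init_imgsearch

subs = load_config("Brain").get("subsystems", {})

if subs.get("lidar", True):
    init_lidar()              # spawns the SLAM worker subprocess
if subs.get("imgsearch", False):
    init_imgsearch()          # off by default
if subs.get("autonomous", True):
    auto = AutonomousMode()   # patrol state machine
if subs.get("voice", True):
    _init_voice()             # BuiltinMic + Whisper + TtsMaker loop
```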
---
## Voice pipeline (when `subsystems.voice = true`)
```
G1 body mic (array)
└─ UDP multicast 239.168.123.161:5555 ── int16 mono 16 kHz PCM
Voice/builtin_mic.py::BuiltinMic
ring buffer (64 KB) + read_chunk(n)
Voice/marcus_voice.py::VoiceModule (IDLE → WAKE_HEARD → PROCESSING → SPEAKING)
├─ IDLE : 2-s chunks → Whisper tiny → wake-word match ("sanad"/"sannad"/…)
├─ WAKE_HEARD : audio_api.speak("Listening") → G1 body speaker
├─ PROCESSING : record-until-silence → Whisper small → transcribed text
└─ on_command(text, "en")
Brain/marcus_brain.py::process_command(text)
├─ regex fast-path → Brain/command_parser.py::try_local_command()
│ places · odometry walk/turn · patrol · session recall · goal_nav · auto on/off
└─ else → _handle_llava(text)
├─ get_frame() (10×50 ms poll, no 1 s stall)
├─ API/llava_api.py::ask(text, img)
│ ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120)
│ → parse_json() → {actions, arm, speak, abort}
└─ Brain/executor.py::execute(d)
├─ actions → API/zmq_api.py::send_vel(vx, vy, vyaw) → Holosoma
├─ arm → API/arm_api.py (stub for now)
└─ abort → gradual_stop()
result["speak"] → audio_api.speak(reply)
API/audio_api.py::speak(text, lang="en")
├─ mute mic (flush BuiltinMic buffer)
├─ Voice/builtin_tts.py::BuiltinTTS.speak(text)
│ client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only
│ time.sleep(len(text) * 0.08)
└─ unmute mic → back to IDLE
```
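The IDLE-state wake check reduces to a substring match over the Whisper `tiny` transcript; a sketch (assumed implementation; `Voice/marcus_voice.py` owns the real logic):
```python
import re

def heard_wake_word(transcript: str, wake_words) -> bool:
    """True if any configured wake variant appears in the Whisper transcript."""
    words = re.sub(r"[^a-z ]", " ", transcript.lower()).split()
    return any(w in words for w in wake_words)

# heard_wake_word("Hey Sanad, come here", ["sanad", "sannad", "sanat", "sunnat"]) -> True
```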
---
## Terminal / WebSocket command pipeline (same brain, skips voice)
```
run_marcus.py stdin OR Server/marcus_server.py WebSocket
Brain/marcus_brain.py::process_command(text)
▼ (same parser → LLaVA → executor → ZMQ as above)
result dict → stdout OR WebSocket reply frame
```
---
## Vision pipeline (continuous, consumed by brain on demand)
```
RealSense D435 (USB)
└─ 424×240 BGR 15 fps
→ API/camera_api.py — shared _raw_frame (thread-safe)
│ │
│ └─ get_frame() → JPEG base64 on demand
Vision/marcus_yolo.py (daemon thread)
YOLOv8m @ cuda:0 FP16 imgsz=320
→ _latest_detections (thread-safe list)
yolo_sees / yolo_closest / yolo_summary / yolo_fps
Navigation/goal_nav.py (fast YOLO check → Qwen-VL fallback)
Autonomous/marcus_autonomous.py (patrol scan every N steps)
Brain/marcus_brain.py (status / alerts)
```
---
## Movement pipeline
```
Brain/executor.py OR Brain/command_parser.py OR Navigation/*
│ uses MOVE_MAP from config_Navigation.json
API/zmq_api.py::send_vel(vx, vy, vyaw) JSON over ZMQ PUB (port 5556)
Holosoma RL policy (separate process, hsinference env)
G1 low-level joint commands over DDS/eth0
29-DOF body motion
```
---
## LiDAR pipeline (when `subsystems.lidar = true`)
```
Livox Mid-360 (192.168.123.120, UDP)
Lidar/SLAM_worker.py (multiprocessing.spawn subprocess — CUDA-safe spawn)
├─ SLAM_engine, SLAM_Filter, SLAM_LoopClosure, SLAM_Submap, SLAM_NavRuntime
├─ publishes pose + obstacle flags back to parent via Queue
└─ writes occupancy grids to Data/Navigation/Maps/
API/lidar_api.py (reads the queues, exposes:)
├─ obstacle_ahead() → bool
├─ get_lidar_status() → dict (pose, loc_state, frame age, FPS, ICP ms)
└─ LIDAR_AVAILABLE
Navigation/goal_nav.py rotation thread — pauses motion on obstacle_ahead()
Brain/command_parser.py — responds to "lidar status" queries
```
---
## Knobs that control each stage
| Knob | Location | Effect |
|---|---|---|
| `subsystems.lidar` | config_Brain.json | SLAM subprocess on/off |
| `subsystems.voice` | config_Brain.json | BuiltinMic + Whisper + TtsMaker loop on/off |
| `subsystems.imgsearch` | config_Brain.json | image-guided search init on/off |
| `subsystems.autonomous` | config_Brain.json | auto-patrol state machine init on/off |
| `num_batch`, `num_ctx` | config_Brain.json | llama.cpp compute-graph size (128 / 2048 ≈ 1.8 GiB graph — **do not raise** on 16 GB Jetson) |
| `num_predict_main` | config_Brain.json | 120 tokens max for the main JSON reply |
| `yolo_device`, `yolo_half` | config_Vision.json | `cuda` / FP16 (hard-required; CPU not allowed) |
| `mic.backend` | config_Voice.json | `builtin_udp` (G1 array) or `pactl_parec` (Hollyland fallback) |
| `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast |
| `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) |
| `stt.wake_words_en` | config_Voice.json | Whisper matcher (`sanad` + variants) |
---
## Per-command latency (estimated, post-fixes)
| Step | Typical | Notes |
|---|---|---|
| Wake-word detect | 200-500 ms | Whisper tiny on 2 s chunk |
| Record until silence | 1-8 s | depends on user speech |
| Whisper small STT | 500-1500 ms | once per command |
| Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall |
| Ollama Qwen2.5-VL | 800-1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` |
| Executor + ZMQ send | <10 ms | fire-and-forget PUB |
| TtsMaker playback | ~len(text) × 80 ms | synthesizes + plays on robot |
**Total wake → answer-playback:** ~**2.5-4 s** for a short vision question like "what do you see" (vs. 5-8 s with the pre-restructure edge-tts/Gemini overhead).
@ -123,26 +123,36 @@ def navigate_to_goal(goal: str, max_steps: int = 0):
    reached = False
    try:
        for step in range(1, max_steps + 1):
            # Track whether real work happened this iteration. If it did,
            # the work itself already ate wall time — don't pay an extra
            # SCAN_INTERVAL nap on top.
            did_work = False

            # --- YOLO fast check ---
            if yolo_target and yolo_sees(yolo_target):
                img_b64 = get_frame()
                did_work = True
                if condition:
                    if not _verify_condition(yolo_target, condition, img_b64):
                        print(f" [GoalNav] YOLO sees {yolo_target} but condition "
                              f"'{condition}' not met — continuing")
                        # fall through to the sleep-skip path
                    else:
                        print(f" [GoalNav] YOLO confirmed '{yolo_target}' at step {step}")
                        log_detection(yolo_target, position="goal", distance="close")
                        reached = True
                        break
                else:
                    print(f" [GoalNav] YOLO confirmed '{yolo_target}' at step {step}")
                    log_detection(yolo_target, position="goal", distance="close")
                    reached = True
                    break
            # --- LLaVA fallback (less frequent — every few steps) ---
            if step >= MIN_STEPS and step % MIN_STEPS == 0:
                img_b64 = get_frame()
                if img_b64:
                    did_work = True
                    d = ask_goal(goal, img_b64)
                    if d.get("reached"):
                        print(f" [GoalNav] LLaVA says goal reached at step {step}")
@ -152,6 +162,11 @@ def navigate_to_goal(goal: str, max_steps: int = 0):
                    if speak:
                        print(f" [GoalNav] LLaVA: {speak}")
            # Only pay the scan interval when nothing happened this step.
            # If YOLO hit or LLaVA fired, they already took 50-1000 ms.
            if not did_work:
                time.sleep(SCAN_INTERVAL)
    finally:
        rotating[0] = False
        rot_thread.join(timeout=1.0)
@ -59,7 +59,9 @@ except ImportError:
# ══════════════════════════════════════════════════════════════════════════════
DEFAULT_MAX_STEPS = 60       # max rotation steps before giving up
STEP_DELAY = 0.15            # min gap between YOLO checks (was 0.4 — reduced
                             #   because the rotation thread paces motion already
                             #   and each LLaVA call is 600-1500 ms of real work)
ROTATE_SPEED = 0.25          # rad/s rotation speed during search
MIN_STEPS_WARMUP = 3         # skip first N steps (stale frame)
MATCH_CONFIDENCE_THR = 0.6   # LLaVA confidence threshold (not used directly,
Voice/builtin_mic.py (new file, 202 lines)
@ -0,0 +1,202 @@
"""
builtin_mic.py - G1 built-in microphone (UDP multicast capture)
================================================================
The G1 humanoid's on-board microphone is published by the Unitree firmware
as an RTP-like UDP multicast stream on 239.168.123.161:5555, carrying
16 kHz mono int16 PCM. Any host on the robot's 192.168.123.0/24 network
can join the group and read the audio; no extra SDK call required.
This module intentionally has no dependency on pyaudio, pulseaudio, or the
unitree_sdk2py package. Joining the multicast group is all that's needed.
Usage:
from Voice.builtin_mic import BuiltinMic
mic = BuiltinMic()
mic.start()
try:
chunk = mic.read_chunk(1024) # 512 samples, 32 ms at 16 kHz
...
finally:
mic.stop()
Ported from Project/Sanad/voice/audio_io.py (Sanad's production implementation).
"""
from __future__ import annotations
import socket
import struct
import subprocess
import threading
import time
from typing import Optional
DEFAULT_GROUP = "239.168.123.161"
DEFAULT_PORT = 5555
DEFAULT_BUF_MAX = 64_000 # ~2 s of 16 kHz mono int16
DEFAULT_READ_TIMEOUT = 0.04 # 40 ms budget per read_chunk call
SAMPLE_RATE = 16_000 # hardware rate — do not change
def _find_g1_local_ip() -> str:
"""
Return the host IPv4 on the G1's internal 192.168.123.0/24 network.
Required by IP_ADD_MEMBERSHIP so the kernel knows which NIC to join on.
"""
out = subprocess.run(
["ip", "-4", "-o", "addr"], capture_output=True, text=True,
).stdout
for line in out.splitlines():
for tok in line.split():
if tok.startswith("192.168.123."):
return tok.split("/")[0]
raise RuntimeError(
"BuiltinMic: no interface on 192.168.123.0/24 — "
"host is not on the G1's internal network"
)
class BuiltinMic:
"""
G1 on-board microphone over UDP multicast.
Thread-safe: a background daemon thread receives datagrams into an
internal ring buffer; `read_chunk(n)` pulls the next `n` bytes or
blocks up to `read_timeout` before returning zeros.
"""
sample_rate = SAMPLE_RATE
def __init__(
self,
group: str = DEFAULT_GROUP,
port: int = DEFAULT_PORT,
buf_max: int = DEFAULT_BUF_MAX,
read_timeout: float = DEFAULT_READ_TIMEOUT,
):
self._group = group
self._port = port
self._buf_max = buf_max
self._read_timeout = read_timeout
self._sock: Optional[socket.socket] = None
self._buf = bytearray()
self._lock = threading.Lock()
self._running = False
self._thread: Optional[threading.Thread] = None
def start(self) -> None:
if self._running:
return
local_ip = _find_g1_local_ip()
self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
self._sock.bind(("", self._port))
mreq = struct.pack(
"4s4s",
socket.inet_aton(self._group),
socket.inet_aton(local_ip),
)
self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
self._sock.settimeout(1.0)
self._running = True
self._thread = threading.Thread(
target=self._recv_loop, daemon=True, name="builtin_mic_rx",
)
self._thread.start()
print(f" [BuiltinMic] joined {self._group}:{self._port} on {local_ip}")
def _recv_loop(self) -> None:
while self._running:
try:
data, _ = self._sock.recvfrom(4096)
with self._lock:
self._buf.extend(data)
# ring-buffer: drop oldest when we'd exceed buf_max
if len(self._buf) > self._buf_max:
del self._buf[: len(self._buf) - self._buf_max]
except socket.timeout:
continue
except Exception:
if self._running:
time.sleep(0.01)
def read_chunk(self, num_bytes: int) -> bytes:
"""
Return exactly `num_bytes` of 16 kHz mono int16 PCM.
Waits up to `read_timeout` for that many bytes to be available.
If the buffer is still short after the timeout, returns whatever
is available padded with silence. Never blocks forever.
"""
deadline = time.time() + self._read_timeout
while time.time() < deadline:
with self._lock:
if len(self._buf) >= num_bytes:
chunk = bytes(self._buf[:num_bytes])
del self._buf[:num_bytes]
return chunk
time.sleep(0.003)
with self._lock:
avail = len(self._buf)
if avail > 0:
chunk = bytes(self._buf[:avail])
del self._buf[:avail]
return chunk + b"\x00" * (num_bytes - avail)
return b"\x00" * num_bytes
def read_seconds(self, seconds: float) -> bytes:
"""
Convenience: capture `seconds` of audio and return as bytes.
Blocks for the full duration (not a real-time producer).
"""
num_bytes = int(seconds * self.sample_rate * 2) # 2 bytes/sample (int16)
out = bytearray()
chunk_bytes = 1024
while len(out) < num_bytes:
out.extend(self.read_chunk(min(chunk_bytes, num_bytes - len(out))))
return bytes(out)
def flush(self) -> None:
"""Drop all buffered audio (e.g. after the robot spoke)."""
with self._lock:
self._buf.clear()
def stop(self) -> None:
self._running = False
if self._sock is not None:
try:
self._sock.close()
except Exception:
pass
self._sock = None
if self._thread is not None:
self._thread.join(timeout=1.5)
self._thread = None
# ────────────────────────────────────────────────────────────────
# Standalone test — capture 3 s and print energy stats
# ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
import array
print("BuiltinMic standalone test — capturing 3 s from G1...")
mic = BuiltinMic()
mic.start()
time.sleep(0.3) # let the receiver thread warm up
raw = mic.read_seconds(3.0)
mic.stop()
samples = array.array("h", raw)
if not samples:
print(" FAIL — got zero samples")
else:
mn = min(samples); mx = max(samples)
mean_abs = sum(abs(s) for s in samples) / len(samples)
print(f" samples={len(samples)} min={mn} max={mx} mean|s|={mean_abs:.0f}")
if mean_abs > 30:
print(" OK — mic is capturing audio")
else:
print(" WARN — signal very low, check G1 audio service is running")
Voice/builtin_tts.py (new file, 88 lines)
@ -0,0 +1,88 @@
"""
builtin_tts.py - Unitree G1 built-in TTS (English only)
========================================================
Thin wrapper around AudioClient.TtsMaker(text, speaker_id). The G1's on-board
TTS engine synthesizes and plays directly through the body speaker; no
internet, no MP3/WAV roundtrip, no audio SDK plumbing on our side.
Supported languages (firmware-side):
    English - works (Marcus uses this)
    Chinese - works (unused)
    Arabic  - silently falls back to Chinese (unusable; we refuse these)

Signature:
    client.TtsMaker(text: str, speaker_id: int) -> int   # 0 = success
    speaker_id in {0, 1, 2}: different voice timbres
Usage:
from Voice.builtin_tts import BuiltinTTS
tts = BuiltinTTS(audio_client)
tts.speak("Hello, I am Sanad", speaker_id=0)
"""
from __future__ import annotations
import logging
import time
from typing import Optional
log = logging.getLogger("builtin_tts")
class BuiltinTTS:
"""Synchronous English-only TTS via the G1's on-board engine."""
# Rough playback duration per character — enough margin that `speak()`
# returns after audio has actually finished on the robot.
SECONDS_PER_CHAR = 0.08
MIN_SECONDS = 1.5
def __init__(self, audio_client, default_speaker_id: int = 0):
"""
Args:
audio_client : initialized unitree_sdk2py AudioClient
default_speaker_id : 0, 1, or 2 (default voice timbre)
"""
self._client = audio_client
self._default_speaker = default_speaker_id
def speak(
self,
text: str,
speaker_id: Optional[int] = None,
block: bool = True,
) -> int:
"""
Play `text` on the G1 speaker via TtsMaker.
English-only by policy. Non-ASCII (Arabic) input is rejected rather
than silently played back as Chinese. Returns the TtsMaker status
code (0 = success) or -1 if input was rejected.
"""
if not text or not text.strip():
return -1
# Reject non-English. TtsMaker "falls back" by playing Arabic text
# as Chinese phonemes — intelligible to nobody — so we refuse it
# rather than surprise the operator.
if any(ord(c) > 127 for c in text):
log.warning("builtin_tts refusing non-ASCII text: %r", text[:60])
return -1
sid = self._default_speaker if speaker_id is None else speaker_id
log.info("[TtsMaker sid=%d] %s", sid, text[:80])
try:
code = self._client.TtsMaker(text, sid)
except Exception as e:
log.error("TtsMaker call failed: %s", e)
return -1
if block:
# Estimate how long the G1 is going to take to finish speaking.
# TtsMaker is fire-and-forget — we need to wait so the mic loop
# knows when to unmute.
duration = max(self.MIN_SECONDS, len(text) * self.SECONDS_PER_CHAR)
time.sleep(duration)
return code
Voice/marcus_gemini_voice.py (deleted, 608 lines)
@ -1,608 +0,0 @@
#!/usr/bin/env python3
"""
Voice/marcus_gemini_voice.py - Marcus Gemini Live Voice Module v2
==================================================================
Real-time bidirectional voice conversation using Gemini 2.5 Flash native audio.
Uses G1 built-in speaker + Hollyland wireless mic.
Based on SanadVoice/gemini_interact architecture:
- PyAudio for mic (not parec)
- Echo suppression (silence when speaking)
- Gemini VAD (automatic activity detection)
- thinkingBudget=0 (no thinking text)
- ASR buffering for full sentences
- Vision routed to brain's Qwen camera
Usage:
from Voice.marcus_gemini_voice import GeminiVoiceModule
voice = GeminiVoiceModule(audio_api, on_transcript=callback)
voice.start()
"""
import array
import asyncio
import base64
import json
import logging
import os
import subprocess
import threading
import time
import numpy as np
from dotenv import load_dotenv
load_dotenv()
BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
handlers=[
logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
logging.StreamHandler(),
],
)
log = logging.getLogger("gemini_voice")
def load_config(name: str) -> dict:
path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
with open(path, "r") as f:
return json.load(f)
# ─── CONFIGURATION ────────────────────────────────────────
API_KEY = "AIzaSyDt9Xi83MDZuuPpfwfHyMD92X7ZKdGkqf8"
MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
URI = (
"wss://generativelanguage.googleapis.com/ws/"
"google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent"
f"?key={API_KEY}"
)
VOICE_NAME = "Charon"
SEND_RATE = 16000
RECEIVE_RATE = 24000
CHUNK_SIZE = 512
CHANNELS = 1
def load_system_prompt():
paths = [
os.path.join(PROJECT_ROOT, "Config", "marcus_voice_prompt.txt"),
]
for p in paths:
if os.path.exists(p):
with open(p, "r", encoding="utf-8-sig") as f:
return f.read().strip()
return (
"You are Marcus, a humanoid robot by YS Lootah Technology, Dubai. "
"Speak Arabic (UAE dialect) and English. Be concise — 1-2 sentences max."
)
# ─── AUDIO HELPERS ────────────────────────────────────────
def audio_energy(pcm: bytes) -> int:
try:
samples = array.array("h", pcm)
if not samples:
return 0
return sum(abs(s) for s in samples) // len(samples)
except Exception:
return 0
SILENCE_PCM = b'\x00' * (CHUNK_SIZE * 2)
# ─── GEMINI VOICE MODULE ─────────────────────────────────
class GeminiVoiceModule:
"""Real-time voice conversation via Gemini, using G1 speaker + Hollyland mic."""
def __init__(self, audio_api, on_transcript=None):
self._audio = audio_api
self._on_transcript = on_transcript
self._config = load_config("Voice")
self._mic_source = getattr(audio_api, '_mic_source',
self._config["mic"].get("source_index", "0"))
# State
self.speaking = False
self.interrupted = False
self._running = False
self._thread = None
self._audio_queue = None # Created in async context
# Tuning
self.MIN_THRESHOLD = 3000
self.barge_in_threshold = self.MIN_THRESHOLD
self.REQUIRED_LOUD_CHUNKS = 10
self.PREBUFFER_CHUNKS = 2
self.PLAYBACK_TIMEOUT = 0.25
self.BARGE_IN_COOLDOWN = 0.7
self.AI_SPEAK_GRACE = 0.20
self.ECHO_GUARD_SEC = 0.8
self.SPEAKING_ENERGY_GATE = 0.85
self.SEND_SILENCE_WHEN_SPEAKING = True
# Timing
self._ai_speaking_since = 0.0
self._last_ai_audio_time = 0.0
self._barge_in_block_until = 0.0
self._ignore_input_until = 0.0
# ASR buffer
self._asr_buf = ""
self._asr_last_time = 0.0
self.ASR_WINDOW_SEC = 2.0
# Find Hollyland mic PyAudio device index
self._mic_device_idx = self._find_mic_device()
log.info("GeminiVoiceModule v2 initialized")
# ─── MIC DEVICE DETECTION ─────────────────────────────
def _find_mic_device(self) -> int:
"""Find Hollyland wireless mic in PyAudio devices. Returns device index."""
import pyaudio
import ctypes
ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)
def _alsa_error_handler(filename, line, function, err, fmt):
pass # suppress
c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler)
try:
asound = ctypes.cdll.LoadLibrary("libasound.so.2")
asound.snd_lib_error_set_handler(c_error_handler)
except: pass # ALSA_suppress
pa = pyaudio.PyAudio()
try:
# First: set PulseAudio default source to Hollyland
subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)
# Search for wireless mic by name
for i in range(pa.get_device_count()):
info = pa.get_device_info_by_index(i)
name = info.get("name", "").lower()
if info["maxInputChannels"] > 0 and ("wireless" in name or "hollyland" in name):
log.info("Mic found: [%d] %s (%dHz)", i, info["name"], int(info["defaultSampleRate"]))
return i
# Fallback to 'default' or 'pulse' device
for i in range(pa.get_device_count()):
info = pa.get_device_info_by_index(i)
if info["maxInputChannels"] > 0 and info.get("name", "") in ("default", "pulse"):
log.info("Mic fallback: [%d] %s", i, info["name"])
return i
log.warning("No mic found, using device 0")
return 0
finally:
pa.terminate()
# ─── MIC CALIBRATION ──────────────────────────────────
def _calibrate_mic(self):
"""Calibrate barge-in threshold from ambient noise."""
        import pyaudio
        _suppress_alsa_warnings()  # silence ALSA stderr chatter before PyAudio init
pa = pyaudio.PyAudio()
mic_info = pa.get_device_info_by_index(self._mic_device_idx)
mic_rate = int(mic_info["defaultSampleRate"])
mic_channels = 1
try:
stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
rate=mic_rate, input=True,
input_device_index=self._mic_device_idx,
frames_per_buffer=CHUNK_SIZE)
values = []
for _ in range(40):
data = stream.read(CHUNK_SIZE, exception_on_overflow=False)
values.append(audio_energy(data))
stream.stop_stream()
stream.close()
avg_noise = sum(values) / len(values) if values else 0
self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)
log.info("Mic calibrated: noise=%.0f, threshold=%.0f", avg_noise, self.barge_in_threshold)
except Exception as e:
log.warning("Calibration failed: %s", e)
finally:
pa.terminate()
# ─── G1 SPEAKER PLAYBACK ─────────────────────────────
def _play_buffer_on_g1(self, pcm_24k: np.ndarray):
"""Play 24kHz audio on G1 speaker (resample to 16kHz, single call)."""
if len(pcm_24k) < 100:
return
# Resample 24kHz → 16kHz
tl = int(len(pcm_24k) * 16000 / 24000)
audio_16k = np.interp(
np.linspace(0, len(pcm_24k), tl, endpoint=False),
np.arange(len(pcm_24k)),
pcm_24k.astype(np.float64),
).astype(np.int16)
from unitree_sdk2py.g1.audio.g1_audio_api import (
ROBOT_API_ID_AUDIO_START_PLAY,
ROBOT_API_ID_AUDIO_STOP_PLAY,
)
client = self._audio._client
if not client:
return
app_name = "gemini"
client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))
time.sleep(0.1)
pcm = audio_16k.tobytes()
sid = f"s_{int(time.time() * 1000)}"
param = json.dumps({
"app_name": app_name,
"stream_id": sid,
"sample_rate": 16000,
"channels": 1,
"bits_per_sample": 16,
})
client._CallRequestWithParamAndBin(ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm))
duration = len(audio_16k) / 16000
time.sleep(duration + 0.3)
client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))
# ─── WEBSOCKET TASKS ─────────────────────────────────
async def _capture_mic(self, ws):
"""Continuously capture mic via PyAudio and send to Gemini."""
        import pyaudio
        _suppress_alsa_warnings()  # silence ALSA stderr chatter before PyAudio init
pa = pyaudio.PyAudio()
mic_info = pa.get_device_info_by_index(self._mic_device_idx)
mic_rate = int(mic_info["defaultSampleRate"])
mic_channels = 1
# Open mic at native rate/channels
stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
rate=mic_rate, input=True,
input_device_index=self._mic_device_idx,
frames_per_buffer=CHUNK_SIZE)
log.info("Mic stream: device=%d, rate=%d, ch=%d", self._mic_device_idx, mic_rate, mic_channels)
loud_chunks = 0
loop = asyncio.get_event_loop()
needs_resample = mic_rate != SEND_RATE or mic_channels != 1
try:
while self._running:
data = await loop.run_in_executor(
None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False))
# Convert to mono 16kHz if needed
if needs_resample:
audio = np.frombuffer(data, dtype=np.int16)
                    # Stereo to mono (defensive: the stream above is opened mono)
if mic_channels == 2:
audio = audio.reshape(-1, 2).mean(axis=1).astype(np.int16)
# Resample to 16kHz
if mic_rate != SEND_RATE:
tl = int(len(audio) * SEND_RATE / mic_rate)
if tl > 0:
audio = np.interp(
np.linspace(0, len(audio), tl, endpoint=False),
np.arange(len(audio)),
audio.astype(np.float64),
).astype(np.int16)
data = audio.tobytes()
energy = audio_energy(data)
now = time.time()
# Barge-in detection
if self.speaking and now >= self._barge_in_block_until:
if (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE:
if energy > self.barge_in_threshold:
loud_chunks += 1
else:
loud_chunks = 0
if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
log.info("Barge-in detected!")
self.interrupted = True
self.speaking = False
                            # Drain queued AI audio so playback stops immediately
                            while not self._audio_queue.empty():
                                try:
                                    self._audio_queue.get_nowait()
                                except asyncio.QueueEmpty:
                                    break
loud_chunks = 0
self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN
# Echo suppression: send silence while speaking
data_to_send = data
if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
if energy < gate:
data_to_send = SILENCE_PCM
# Send to Gemini
b64 = base64.b64encode(data_to_send).decode()
msg = {
"realtime_input": {
"media_chunks": [
{"mime_type": f"audio/pcm;rate={SEND_RATE}", "data": b64}
]
}
}
await ws.send(json.dumps(msg))
except Exception as e:
if self._running:
log.error("Mic error: %s", e)
finally:
stream.stop_stream()
stream.close()
pa.terminate()
async def _receive_audio(self, ws):
"""Receive audio responses and transcriptions from Gemini."""
async for msg in ws:
if not self._running:
break
try:
response = json.loads(msg)
server_content = response.get("serverContent", {})
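                # Gemini acknowledges the barge-in; clear our local flag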
if server_content.get("interrupted"):
self.interrupted = False
# User transcription (partial/streaming)
input_tr = (
server_content.get("inputTranscription")
or server_content.get("input_transcription")
or server_content.get("inputAudioTranscription")
or server_content.get("input_audio_transcription")
)
if isinstance(input_tr, dict):
text = (input_tr.get("text") or "").strip()
now = time.time()
if text and now >= self._ignore_input_until and not self.speaking:
# Buffer ASR text
if now - self._asr_last_time > self.ASR_WINDOW_SEC:
self._asr_buf = ""
self._asr_buf = text # Gemini sends cumulative transcription
self._asr_last_time = now
if self.interrupted:
continue
# Audio from Gemini
model_turn = server_content.get("modelTurn")
if model_turn:
for part in model_turn.get("parts", []):
inline_data = part.get("inlineData")
if inline_data:
audio_b64 = inline_data.get("data")
if audio_b64:
now = time.time()
if not self.speaking:
self._ai_speaking_since = now
# Gemini started responding — fire transcript callback
if self._asr_buf and self._on_transcript:
self._on_transcript(self._asr_buf, "user")
self.speaking = True
self._last_ai_audio_time = now
self._ignore_input_until = now + self.ECHO_GUARD_SEC
audio_bytes = base64.b64decode(audio_b64)
await self._audio_queue.put(audio_bytes)
# Text from Gemini (thinking/response text)
text_part = part.get("text", "").strip()
if text_part and self._on_transcript:
self._on_transcript(text_part, "marcus")
# Turn complete — Gemini finished speaking
turn_complete = server_content.get("turnComplete")
if turn_complete:
# Clear ASR buffer after turn
self._asr_buf = ""
except Exception as e:
log.error("Receive error: %s", e)
async def _play_audio(self):
"""Collect Gemini audio chunks and play on G1 speaker."""
while self._running:
try:
if not self.speaking:
await asyncio.sleep(0.05)
continue
# Pre-buffer
buffered = False
while self.speaking and not buffered:
if self._audio_queue.qsize() >= self.PREBUFFER_CHUNKS:
buffered = True
else:
await asyncio.sleep(0.01)
# Collect all audio chunks
buffer_chunks = []
while self.speaking:
try:
data = await asyncio.wait_for(
self._audio_queue.get(), timeout=self.PLAYBACK_TIMEOUT)
audio = np.frombuffer(data, dtype=np.int16)
buffer_chunks.append(audio)
self._last_ai_audio_time = time.time()
except asyncio.TimeoutError:
if self._audio_queue.empty():
if time.time() - self._last_ai_audio_time > 0.3:
break
# Play on G1 speaker
if buffer_chunks:
full_audio = np.concatenate(buffer_chunks)
duration = len(full_audio) / RECEIVE_RATE
log.info("Playing %.1fs on G1", duration)
await asyncio.get_event_loop().run_in_executor(
None, self._play_buffer_on_g1, full_audio)
self.speaking = False
except Exception as e:
log.error("Play error: %s", e)
self.speaking = False
# ─── MAIN LOOP ────────────────────────────────────────
async def _run_async(self):
import websockets
import inspect
system_prompt = load_system_prompt()
# Unmute mic
subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)
# Calibrate
self._calibrate_mic()
ws_kwargs = {"max_size": None}
try:
sig = inspect.signature(websockets.connect)
if "extra_headers" in sig.parameters:
ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
else:
ws_kwargs["additional_headers"] = {"Content-Type": "application/json"}
except Exception:
ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
while self._running:
try:
log.info("Connecting to Gemini...")
async with websockets.connect(URI, **ws_kwargs) as ws:
setup_msg = {
"setup": {
"model": MODEL,
"generationConfig": {
"responseModalities": ["AUDIO"],
"thinkingConfig": {"thinkingBudget": 0},
"speechConfig": {
"voiceConfig": {
"prebuiltVoiceConfig": {"voiceName": VOICE_NAME}
}
},
},
"realtimeInputConfig": {
"automaticActivityDetection": {
"startOfSpeechSensitivity": "START_SENSITIVITY_HIGH",
"prefixPaddingMs": 40,
"endOfSpeechSensitivity": "END_SENSITIVITY_HIGH",
"silenceDurationMs": 250,
}
},
"inputAudioTranscription": {},
"systemInstruction": {"parts": [{"text": system_prompt}]},
}
}
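                    # thinkingBudget 0 turns off model "thinking" to cut
                    # first-audio latency; the automaticActivityDetection block
                    # tunes Gemini's server-side VAD (high start/end sensitivity,
                    # 250 ms of silence closes the user's turn).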
await ws.send(json.dumps(setup_msg))
await ws.recv()
log.info("Connected! Always listening...")
self._audio_queue = asyncio.Queue()
await asyncio.gather(
self._capture_mic(ws),
self._receive_audio(ws),
self._play_audio(),
)
except Exception as e:
if self._running:
log.error("Connection error: %s — reconnecting in 3s", e)
await asyncio.sleep(3)
def _voice_thread(self):
asyncio.run(self._run_async())
# ─── START / STOP ─────────────────────────────────────
def start(self):
if self._running:
return
self._running = True
self._thread = threading.Thread(target=self._voice_thread, daemon=True, name="gemini_voice")
self._thread.start()
log.info("Gemini voice module started")
def stop(self):
self._running = False
if self._thread:
self._thread.join(timeout=5)
self._thread = None
log.info("Gemini voice module stopped")
@property
def is_running(self) -> bool:
return self._running
@property
def state(self) -> str:
return "LISTENING" if self._running else "STOPPED"
@property
def is_speaking(self) -> bool:
return self.speaking
# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
import sys
sys.path.insert(0, PROJECT_ROOT)
from API.audio_api import AudioAPI
def on_transcript(text, role):
print(f" [{role.upper()}] {text}")
audio = AudioAPI()
voice = GeminiVoiceModule(audio, on_transcript=on_transcript)
print("Gemini voice v2 — speak anytime. Ctrl+C to stop.\n")
voice.start()
try:
while voice.is_running:
time.sleep(0.5)
except KeyboardInterrupt:
print("\nStopping...")
voice.stop()

View File

@ -1,19 +1,20 @@
 #!/usr/bin/env python3
 """
-Features/Voice/marcus_voice.py Marcus Always-Listening Voice Module
-======================================================================
+Voice/marcus_voice.py Marcus Always-Listening Voice Module (English)
+=======================================================================
 
 State machine:
   IDLE → (wake word detected) → WAKE_HEARD
   WAKE_HEARD → (record command) → PROCESSING
   PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
   SPEAKING → (TTS done) → IDLE
 
-Wake word: "Marcus" / "ماركوس" (detected by Whisper tiny)
+Wake word: "Marcus" (detected by Whisper tiny)
 Commands: Transcribed by Whisper small
-TTS: Handled by API/audio_api.py
+Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
+TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)
 
 Usage:
-  from Features.Voice.marcus_voice import VoiceModule
+  from Voice.marcus_voice import VoiceModule
   voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
   voice.start()  # background thread
   voice.stop()
@ -21,7 +22,6 @@ Usage:
 import logging
 import os
-import subprocess
 import threading
 import time
 import numpy as np
@ -74,7 +74,8 @@ class VoiceModule:
         """
         Args:
             audio_api: AudioAPI instance (from API/audio_api.py)
-            on_command: callback(text: str, lang: str) called when command is transcribed
+            on_command: callback(text: str, lang: str) "lang" is always "en"
+                        now; kept in the signature for interface stability.
         """
         self._audio = audio_api
         self._on_command = on_command
@ -83,13 +84,23 @@
         self._stt = self._config["stt"]
         self._mic = self._config["mic"]
 
-        # Whisper models — lazy loaded
+        # Whisper models — lazy loaded on first _voice_loop() iteration
         self._wake_model = None
         self._cmd_model = None
 
-        # Wake words
-        self._wake_en = [w.lower() for w in self._stt["wake_words_en"]]
-        self._wake_ar = self._stt["wake_words_ar"]
+        # Wake words (English only — built-in TTS doesn't do Arabic)
+        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
+                                                          ["marcus", "marcos"])]
+
+        # G1 built-in mic (UDP multicast).
+        from Voice.builtin_mic import BuiltinMic
+        _mcfg = self._config.get("mic_udp", {})
+        self._mic_capture = BuiltinMic(
+            group=_mcfg.get("group", "239.168.123.161"),
+            port=_mcfg.get("port", 5555),
+            buf_max=_mcfg.get("buffer_max_bytes", 64000),
+        )
+        self._sample_rate = self._mic_capture.sample_rate  # 16000
 
         # State
         self._state = State.IDLE
@ -97,7 +108,7 @@
         self._thread = None
         self._lock = threading.Lock()
 
-        log.info("VoiceModule initialized")
+        log.info("VoiceModule initialized (mic: G1 built-in UDP)")
 
     # ─── MODEL LOADING ────────────────────────────────────
@ -115,69 +126,49 @@
         self._cmd_model = whisper.load_model(self._stt["command_model"])
         log.info("Command model ready")
 
-    # ─── MIC RECORDING ────────────────────────────────────
+    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────
 
     def _record_chunk(self, seconds: float) -> np.ndarray:
-        """Record audio chunk from mic via parec."""
-        source = self._mic["source_index"]
-        rate = str(self._mic["rate"])
-
-        proc = subprocess.Popen(
-            ["parec", "-d", source,
-             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
-            stdout=subprocess.PIPE,
-        )
-        time.sleep(seconds)
-        proc.terminate()
-        raw = proc.stdout.read()
-        return np.frombuffer(raw, dtype=np.int16)
+        """Capture a fixed-duration chunk from the G1 built-in mic."""
+        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono
+        raw = bytearray()
+        bite = 1024
+        while len(raw) < num_bytes:
+            raw.extend(self._mic_capture.read_chunk(min(bite, num_bytes - len(raw))))
+        return np.frombuffer(bytes(raw), dtype=np.int16)
 
     def _record_until_silence(self) -> np.ndarray:
-        """Record until silence is detected or max duration reached."""
-        source = self._mic["source_index"]
-        rate = self._mic["rate"]
-        threshold = self._stt["silence_threshold"]
-        silence_dur = self._stt["silence_duration_sec"]
-        max_dur = self._stt["max_record_sec"]
+        """Capture until RMS drops below threshold for `silence_duration_sec`."""
+        threshold = self._stt.get("silence_threshold", 500)
+        silence_dur = self._stt.get("silence_duration_sec", 1.5)
+        max_dur = self._stt.get("max_record_sec", 15)
 
         chunk_sec = 0.5
-        chunk_samples = int(rate * chunk_sec)
-        silence_chunks_needed = int(silence_dur / chunk_sec)
+        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
+        silence_chunks_need = int(silence_dur / chunk_sec)
         max_chunks = int(max_dur / chunk_sec)
 
-        proc = subprocess.Popen(
-            ["parec", "-d", source,
-             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
-            stdout=subprocess.PIPE,
-        )
-
         all_audio = []
         silence_count = 0
         chunk_count = 0
-        try:
-            while chunk_count < max_chunks:
-                data = proc.stdout.read(chunk_samples * 2)  # 2 bytes per sample
-                if not data:
-                    break
-
-                chunk = np.frombuffer(data, dtype=np.int16)
-                all_audio.append(chunk)
-                chunk_count += 1
-
-                # Check for silence
-                rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
-                if rms < threshold:
-                    silence_count += 1
-                else:
-                    silence_count = 0
-
-                if silence_count >= silence_chunks_needed and chunk_count > 2:
-                    log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
-                    break
-        finally:
-            proc.terminate()
-            proc.stdout.read()  # drain
+        while chunk_count < max_chunks:
+            raw = self._mic_capture.read_chunk(chunk_bytes)
+            if not raw:
+                break
+            chunk = np.frombuffer(raw, dtype=np.int16)
+            all_audio.append(chunk)
+            chunk_count += 1
+
+            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
+            if rms < threshold:
+                silence_count += 1
+            else:
+                silence_count = 0
+
+            if silence_count >= silence_chunks_need and chunk_count > 2:
+                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
+                break
 
         if all_audio:
             return np.concatenate(all_audio)
@ -205,38 +196,18 @@
         return text
 
     def _check_wake_word(self, text: str) -> bool:
-        """Check if transcribed text contains a wake word."""
+        """Check if transcribed text contains an English wake word."""
         text_lower = text.lower().strip()
-
-        # English wake words
-        for w in self._wake_en:
-            if w in text_lower:
-                return True
-
-        # Arabic wake words
-        for w in self._wake_ar:
-            if w in text:
-                return True
-
-        return False
+        return any(w in text_lower for w in self._wake_en)
 
     # ─── MAIN LOOP ────────────────────────────────────────
 
     def _voice_loop(self):
         """Main voice processing loop — runs in background thread."""
         self._load_whisper()
+        self._mic_capture.start()
         log.info("Voice loop started — listening for wake word...")
 
-        # Unmute mic once
-        subprocess.run(
-            ["pactl", "set-source-mute", self._mic["source_index"], "0"],
-            capture_output=True,
-        )
-        subprocess.run(
-            ["pactl", "set-source-volume", self._mic["source_index"], "100%"],
-            capture_output=True,
-        )
-
         while self._running:
             try:
                 if self._state == State.IDLE:
@ -279,9 +250,7 @@
             self._state = State.WAKE_HEARD
 
             # Acknowledge
-            self._audio.speak(
-                self._config["messages"]["wake_heard"], "en"
-            )
+            self._audio.speak(self._config["messages"]["wake_heard"])
 
     def _do_wake_heard(self):
         """Record the command until silence."""
@ -294,7 +263,7 @@
         if len(audio) < 4000:  # < 0.25s at 16kHz
             log.info("Too short, ignoring")
-            self._audio.speak(self._config["messages"]["no_speech"], "en")
+            self._audio.speak(self._config["messages"]["no_speech"])
             self._state = State.IDLE
             return
@ -308,18 +277,16 @@
         if not text or len(text.strip()) < 2:
             log.info("Empty transcription")
-            self._audio.speak(self._config["messages"]["no_speech"], "en")
+            self._audio.speak(self._config["messages"]["no_speech"])
             self._state = State.IDLE
             return
 
-        # Detect language
-        lang = "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
-        log.info("Command [%s]: %s", lang, text)
+        log.info("Command: %s", text)
 
-        # Send to brain callback
+        # Send to brain callback (lang always "en" in this build)
         if self._on_command:
             try:
-                self._on_command(text, lang)
+                self._on_command(text, "en")
             except Exception as e:
                 log.error("Brain callback error: %s", e)
@ -342,6 +309,10 @@
     def stop(self):
         """Stop voice listening."""
         self._running = False
+        try:
+            self._mic_capture.stop()
+        except Exception:
+            pass
         if self._thread:
             self._thread.join(timeout=5)
             self._thread = None
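
Note on Voice/builtin_mic.py: the file itself is not shown in this commit view.
Inferring only from the call sites above (constructor kwargs group/port/buf_max,
a sample_rate attribute, and start()/read_chunk()/stop()), a minimal sketch of
the interface could look like the following; class and method names match the
callers, everything else is an assumption rather than the shipped code.

import socket
import struct
import threading


class BuiltinMic:
    """Sketch: G1 built-in mic, 16 kHz mono s16le over UDP multicast."""

    def __init__(self, group="239.168.123.161", port=5555, buf_max=64000):
        self.sample_rate = 16000            # fixed by the robot firmware
        self._group, self._port, self._buf_max = group, port, buf_max
        self._buf = bytearray()
        self._cond = threading.Condition()  # guards _buf, wakes readers
        self._running = False
        self._sock = None
        self._thread = None

    def start(self):
        """Join the multicast group and buffer packets on a daemon thread."""
        self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self._sock.bind(("", self._port))
        mreq = struct.pack("4sl", socket.inet_aton(self._group), socket.INADDR_ANY)
        self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
        self._running = True
        self._thread = threading.Thread(target=self._rx_loop, daemon=True)
        self._thread.start()

    def _rx_loop(self):
        while self._running:
            try:
                data, _ = self._sock.recvfrom(4096)
            except OSError:        # socket closed by stop()
                break
            with self._cond:
                self._buf.extend(data)
                overflow = len(self._buf) - self._buf_max
                if overflow > 0:   # bound latency: drop the oldest audio
                    del self._buf[:overflow]
                self._cond.notify_all()

    def read_chunk(self, num_bytes: int) -> bytes:
        """Block until num_bytes of PCM are buffered; b'' after stop()."""
        with self._cond:
            while self._running and len(self._buf) < num_bytes:
                self._cond.wait(timeout=0.5)
            take = min(num_bytes, len(self._buf))
            out = bytes(self._buf[:take])
            del self._buf[:take]
            return out

    def stop(self):
        self._running = False
        if self._sock:
            self._sock.close()

The drop-oldest buffer matches the buffer_max_bytes default: 64000 bytes is two
seconds at 16 kHz mono s16le, so a stalled reader resumes on recent audio
instead of a stale backlog.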