commit e0f6acd5c7 (parent 8491be7f1e)
Update 2026-04-21 16:10:00

API/audio_api.py — 278 changed lines
@@ -2,19 +2,24 @@
 """
 API/audio_api.py — Marcus Audio API Layer
 ==========================================
-Provides speak() and record_audio() for the Brain layer.
+Provides speak() and record() for the Brain layer.
 Brain imports ONLY from this API — never from unitree SDK directly.

-Speaker: _CallRequestWithParamAndBin (single call, full buffer)
-Mic: parec -d 3 (Hollyland wireless, PulseAudio source index from config)
-TTS EN: Unitree built-in TtsMaker
-TTS AR: Piper ar_JO-kareem-medium → resample → G1 speaker
+Speaker: Unitree built-in TtsMaker (G1 on-board engine, English only,
+    no MP3/WAV plumbing, no internet). Optional raw-PCM playback path
+    via _play_pcm() is kept for future modules that synthesize their
+    own audio (e.g. offline Piper).
+Mic: G1 built-in mic (UDP multicast 239.168.123.161:5555, 16 kHz mono).
+    Legacy Hollyland/parec path retained as fallback when
+    config_Voice.json has mic.backend="pactl_parec".
+TTS: English only. Arabic is rejected (the G1 firmware silently maps
+    Arabic to Chinese, which confuses everyone — if Arabic TTS is ever
+    needed again, use a separate offline backend like Piper).

 Usage:
     from API.audio_api import AudioAPI
     audio = AudioAPI()
-    audio.speak("Hello", "en")
-    audio.speak("مرحبا", "ar")
+    audio.speak("Hello, I am Sanad")
     recording = audio.record(seconds=5)
     audio.play_pcm(recording)
 """
@@ -71,7 +76,24 @@ class AudioAPI:
         self._tts = self._config["tts"]
         self._mic = self._config["mic"]
         self._spk = self._config["speaker"]
-        self._target_rate = self._tts["target_sample_rate"]
+        self._target_rate = self._tts.get("target_sample_rate", 16000)
+
+        # Default mic backend: G1 built-in UDP multicast.
+        # Set mic.backend="pactl_parec" in config_Voice.json to fall back
+        # to the legacy Hollyland/PulseAudio path.
+        self._mic_backend = self._mic.get("backend", "builtin_udp")
+        self._builtin_mic = None  # lazy-initialized on first record()
+
+        # Built-in TTS wrapper (uses the already-initialized AudioClient).
+        # Keeps TTS synchronous so `is_speaking` is meaningful to the voice
+        # loop that needs to skip mic input during playback.
+        self._tts_engine = None
+        if self._sdk_available:
+            from Voice.builtin_tts import BuiltinTTS
+            self._tts_engine = BuiltinTTS(
+                self._client,
+                default_speaker_id=self._tts.get("builtin_speaker_id", 0),
+            )

         # Data dir
         data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"])
@@ -82,7 +104,10 @@ class AudioAPI:
         self._speaking = False
         self._speak_lock = threading.Lock()

-        log.info(self._config["messages"]["ready"])
+        log.info("%s (mic=%s, tts=%s)",
+                 self._config["messages"]["ready"],
+                 self._mic_backend,
+                 "builtin_ttsmaker" if self._tts_engine else "disabled")

     def _init_sdk(self):
         """Initialize Unitree AudioClient."""
@@ -105,55 +130,63 @@ class AudioAPI:

     # ─── SPEAK ────────────────────────────────────────────

-    def speak(self, text: str, lang: str = "auto"):
+    def speak(self, text: str, lang: str = "en"):
         """
-        Speak text in the given language.
-        Mutes mic during playback to prevent self-listening.
-        lang="en" → built-in TtsMaker
-        lang="ar" → Piper → resample → G1 speaker
-        lang="auto" → detect from text
-        """
-        if lang == "auto":
-            lang = self._detect_lang(text)
-
-        log.info("[%s] speak: %s", lang.upper(), text[:80])
+        Speak `text` in English through the G1 built-in TTS (TtsMaker).
+
+        Mutes (flushes) the mic during playback so the voice loop doesn't
+        hear the robot's own voice and transcribe itself. The `lang`
+        argument is accepted for API compatibility but only "en" plays —
+        non-ASCII text (Arabic) is rejected by BuiltinTTS.
+        """
+        if lang and lang not in ("en", "auto"):
+            log.warning("builtin_tts only supports English; got lang=%r — skipping", lang)
+            return
+        if self._tts_engine is None:
+            log.error("No TTS engine initialized — audio SDK unavailable")
+            return
+
+        log.info("speak: %s", text[:80])

         with self._speak_lock:
             self._speaking = True
             self._mute_mic()

             try:
-                if lang == "en":
-                    self._speak_english(text)
-                elif lang == "ar":
-                    self._speak_arabic(text)
-                else:
-                    log.warning("Unknown lang '%s', falling back to English", lang)
-                    self._speak_english(text)
+                self._tts_engine.speak(text, block=True)
             except Exception as e:
                 log.error("%s: %s", self._config["messages"]["error_tts"], e)
             finally:
-                # Small delay so speaker fully stops before mic reopens
-                time.sleep(0.3)
+                # Small tail so the speaker fully finishes before the mic is
+                # re-opened for capture
+                time.sleep(0.2)
                 self._unmute_mic()
                 self._speaking = False

     def _mute_mic(self):
-        """Mute the wireless mic to prevent self-listening."""
+        """
+        Suppress mic input during TTS playback.
+        For the UDP built-in mic, flush the buffer so we don't capture any
+        echo that's already been queued. For the legacy PulseAudio path,
+        actually mute the source.
+        """
+        if self._mic_backend == "builtin_udp":
+            if self._builtin_mic is not None:
+                self._builtin_mic.flush()
+            return
         source = self._mic["source_index"]
-        subprocess.run(
-            ["pactl", "set-source-mute", source, "1"],
-            capture_output=True,
-        )
+        subprocess.run(["pactl", "set-source-mute", source, "1"],
+                       capture_output=True)
         log.debug("Mic muted")

     def _unmute_mic(self):
-        """Unmute the wireless mic."""
+        """Re-enable mic after TTS playback (pactl path only)."""
+        if self._mic_backend == "builtin_udp":
+            if self._builtin_mic is not None:
+                self._builtin_mic.flush()
+            return
         source = self._mic["source_index"]
-        subprocess.run(
-            ["pactl", "set-source-mute", source, "0"],
-            capture_output=True,
-        )
+        subprocess.run(["pactl", "set-source-mute", source, "0"],
+                       capture_output=True)
         log.debug("Mic unmuted")

     @property
@@ -161,88 +194,8 @@ class AudioAPI:
         """True while TTS is playing — voice module checks this."""
         return self._speaking

-    def _speak_english(self, text: str):
-        """English TTS via edge-tts."""
-        self._speak_edge_tts(text, "en")
-
-    def _speak_arabic(self, text: str):
-        """Arabic TTS via edge-tts."""
-        self._speak_edge_tts(text, "ar")
-
-    def speak_piper_en(self, text: str):
-        """Alternative: English via Piper instead of built-in."""
-        voice = self._tts["piper_voice_en"]
-        audio, rate = self._piper_synthesize(text, voice)
-        audio_16k = self._resample(audio, rate)
-        self._play_pcm(audio_16k)
-
-    # ─── PIPER TTS ────────────────────────────────────────
-
-    def _piper_synthesize(self, text: str, voice: str) -> tuple:
-        """Run Piper CLI, return (audio_int16, sample_rate)."""
-        cmd = ["piper", "--model", voice, "--output_raw"]
-        timeout = self._tts["piper_timeout_sec"]
-
-        proc = subprocess.run(
-            cmd,
-            input=text.encode("utf-8"),
-            capture_output=True,
-            timeout=timeout,
-        )
-
-        if proc.returncode != 0:
-            stderr = proc.stderr.decode()[:300]
-            raise RuntimeError(f"Piper failed: {stderr}")
-
-        audio = np.frombuffer(proc.stdout, dtype=np.int16)
-        piper_rate = self._tts["piper_sample_rate"]
-        log.info("Piper: %d samples @ %dHz (%.1fs)", len(audio), piper_rate, len(audio) / piper_rate)
-        return audio, piper_rate
-
-    # ─── RESAMPLE ─────────────────────────────────────────
-
-    def _speak_edge_tts(self, text: str, lang: str):
-        """Generate speech via edge-tts and play on G1."""
-        import os as _os
-        voice = "ar-AE-HamdanNeural" if lang == "ar" else "en-US-GuyNeural"
-        ts = int(time.time() * 1000)
-        mp3_path = f"/tmp/edge_{lang}_{ts}.mp3"
-        wav_path = f"/tmp/edge_{lang}_{ts}.wav"
-
-        safe_text = text.replace('"', '\\"')
-        code = f'import edge_tts, asyncio; asyncio.run(edge_tts.Communicate(\"{safe_text}\", voice=\"{voice}\").save(\"{mp3_path}\"))'
-        result = subprocess.run(["python3", "-c", code], capture_output=True, text=True, timeout=30)
-
-        if result.returncode != 0:
-            log.error("edge-tts failed: %s", result.stderr[:200])
-            if lang == "en" and self._sdk_available:
-                self._client.TtsMaker(text, self._tts.get("builtin_speaker_id", 1))
-                time.sleep(max(2.0, len(text) * 0.06))
-            return
-
-        try:
-            from pydub import AudioSegment
-            a = AudioSegment.from_mp3(mp3_path)
-            a = a.set_frame_rate(16000).set_channels(1).set_sample_width(2)
-            a.export(wav_path, format="wav")
-
-            import wave
-            with wave.open(wav_path, "rb") as wf:
-                audio = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)
-
-            _os.unlink(mp3_path)
-            _os.unlink(wav_path)
-            self._play_pcm(audio)
-        except Exception as e:
-            log.error("edge-tts conversion error: %s", e)
-            try: _os.unlink(mp3_path)
-            except: pass
-            try: _os.unlink(wav_path)
-            except: pass
-
     def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray:
-        """Resample to target rate (16kHz)."""
+        """Linear resample int16 PCM to self._target_rate (16 kHz)."""
         if src_rate == self._target_rate:
             return audio
         tl = int(len(audio) * self._target_rate / src_rate)
@@ -252,7 +205,7 @@ class AudioAPI:
             audio.astype(np.float64),
         ).astype(np.int16)

-    # ─── G1 SPEAKER PLAYBACK ─────────────────────────────
+    # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────

     def _play_pcm(self, audio_16k: np.ndarray) -> float:
         """Play 16kHz mono int16 on G1 speaker. Returns duration."""
@@ -308,24 +261,50 @@ class AudioAPI:
     # ─── MIC RECORDING ───────────────────────────────────

     def record(self, seconds: float = 5.0) -> np.ndarray:
-        """Record from Hollyland wireless mic via parec. Returns int16 array."""
+        """
+        Capture `seconds` of int16 mono 16 kHz PCM.
+
+        Default backend is the G1 built-in mic (UDP multicast). Set
+        mic.backend="pactl_parec" in config_Voice.json to use the
+        legacy Hollyland/parec path instead.
+        """
+        if self._mic_backend == "builtin_udp":
+            return self._record_builtin(seconds)
+        return self._record_parec(seconds)
+
+    def _record_builtin(self, seconds: float) -> np.ndarray:
+        """Built-in mic path — join UDP multicast, read the requested duration."""
+        if self._builtin_mic is None:
+            from Voice.builtin_mic import BuiltinMic
+            mcfg = self._config.get("mic_udp", {})
+            self._builtin_mic = BuiltinMic(
+                group=mcfg.get("group", "239.168.123.161"),
+                port=mcfg.get("port", 5555),
+                buf_max=mcfg.get("buffer_max_bytes", 64000),
+            )
+            self._builtin_mic.start()
+            time.sleep(0.2)  # let the receiver thread fill in
+
+        log.info("Recording %.1fs from G1 built-in mic", seconds)
+        raw = self._builtin_mic.read_seconds(seconds)
+        audio = np.frombuffer(raw, dtype=np.int16)
+        log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std())
+        if audio.std() < 50:
+            log.warning(self._config["messages"]["error_mic"] +
+                        " — G1 mic silent (check audio service on robot)")
+        return audio
+
+    def _record_parec(self, seconds: float) -> np.ndarray:
+        """Legacy Hollyland/PulseAudio path — only used if mic.backend='pactl_parec'."""
         source = self._mic["source_index"]
         rate = str(self._mic["rate"])
         channels = str(self._mic["channels"])
         fmt = self._mic["format"]

         # Unmute mic
-        subprocess.run(
-            ["pactl", "set-source-mute", source, "0"],
-            capture_output=True,
-        )
-        subprocess.run(
-            ["pactl", "set-source-volume", source, "100%"],
-            capture_output=True,
-        )
-
-        log.info("Recording %.1fs from mic source %s", seconds, source)
+        subprocess.run(["pactl", "set-source-mute", source, "0"], capture_output=True)
+        subprocess.run(["pactl", "set-source-volume", source, "100%"], capture_output=True)

+        log.info("Recording %.1fs from mic source %s (parec)", seconds, source)
         proc = subprocess.Popen(
             ["parec", "-d", source,
              f"--format={fmt}", f"--rate={rate}", f"--channels={channels}", "--raw"],
@@ -337,10 +316,8 @@ class AudioAPI:

         audio = np.frombuffer(raw, dtype=np.int16)
         log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std())
-
         if audio.std() < 50:
             log.warning(self._config["messages"]["error_mic"] + " — mic may be silent")
-
         return audio

     def save_recording(self, audio: np.ndarray, name: str) -> str:
@@ -355,16 +332,6 @@ class AudioAPI:
         log.info("Saved: %s", path)
         return path

-    # ─── LANGUAGE DETECTION ───────────────────────────────
-
-    @staticmethod
-    def _detect_lang(text: str) -> str:
-        """Detect language from text — Arabic Unicode range check."""
-        for c in text:
-            if '\u0600' <= c <= '\u06FF':
-                return "ar"
-        return "en"
-
     # ─── STATUS ───────────────────────────────────────────

     @property
@@ -378,27 +345,16 @@ if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Marcus Audio API Test")
-   parser.add_argument("--test", action="store_true", help="Run speak tests")
-   parser.add_argument("--speak", type=str, help="Speak this text")
-   parser.add_argument("--lang", default="auto", help="Language: en, ar, auto")
+   parser.add_argument("--test", action="store_true", help="Run TTS + record test")
+   parser.add_argument("--speak", type=str, help="Speak this English text")
    parser.add_argument("--record", type=float, default=0, help="Record N seconds")
    args = parser.parse_args()

    api = AudioAPI()

    if args.test:
-       print("\n--- English built-in ---")
-       api.speak("Hello, I am Marcus.", "en")
-       time.sleep(1)
-
-       print("\n--- Arabic Piper ---")
-       api.speak("مرحبا، أنا ماركوس", "ar")
-       time.sleep(1)
-
-       print("\n--- Auto-detect ---")
-       api.speak("How are you?")
-       time.sleep(1)
-       api.speak("كيف حالك؟")
+       print("\n--- English (TtsMaker) ---")
+       api.speak("Hello, I am Sanad.")
+       time.sleep(1)

        print("\n--- Record 3s + playback ---")
@@ -408,7 +364,7 @@ if __name__ == "__main__":
        print("\nDone.")

    elif args.speak:
-       api.speak(args.speak, args.lang)
+       api.speak(args.speak)

    elif args.record > 0:
        rec = api.record(args.record)

API/yolo_api.py
@@ -49,9 +49,28 @@ def init_yolo(raw_frame_ref, frame_lock) -> bool:
        print(f"marcus_yolo.py not found ({e})")
        return False

-   # GPU is required — let RuntimeError from _resolve_device propagate so
-   # Marcus hard-fails at startup instead of silently running without vision.
-   ok = start_yolo(raw_frame_ref=raw_frame_ref, frame_lock=frame_lock)
+   # GPU is required. _resolve_device() raises RuntimeError when CUDA is
+   # missing — surface that with an actionable banner before re-raising so
+   # Marcus hard-fails with a clear error instead of a raw stack trace.
+   try:
+       ok = start_yolo(raw_frame_ref=raw_frame_ref, frame_lock=frame_lock)
+   except RuntimeError as e:
+       print()
+       print("╔" + "═" * 68 + "╗")
+       print("║ MARCUS STARTUP ABORTED — GPU REQUIRED".ljust(69) + "║")
+       print("╠" + "═" * 68 + "╣")
+       print(f"║ {str(e)[:66]:<66} ║")
+       print("║" + " " * 68 + "║")
+       print("║ On the Jetson, verify:".ljust(69) + "║")
+       print("║   tegrastats   # GPU exists & is not throttled".ljust(69) + "║")
+       print("║   python3 -c 'import torch; print(torch.cuda.is_available())'".ljust(69) + "║")
+       print("║   nvcc --version   # CUDA toolkit reachable".ljust(69) + "║")
+       print("║ Expected: torch 2.1.0 nv23.06, CUDA 11.4, GPU=Orin.".ljust(69) + "║")
+       print("║ See Doc/environment.md section 9 for the reinstall recipe.".ljust(69) + "║")
+       print("╚" + "═" * 68 + "╝")
+       print()
+       raise

    if ok:
        YOLO_AVAILABLE = True
        yolo_sees = _ys

API/zmq_api.py
@@ -1,7 +1,16 @@
 """
 zmq_api.py — ZMQ velocity + command interface to Holosoma
+
+Previously the PUB socket was bound at module import time. That made the
+module unsafe to re-import from any multiprocessing child (e.g. the LiDAR
+SLAM_worker spawn), because the child would try to rebind the same port
+and crash with `Address already in use`.
+
+The bind now lives in init_zmq() — call it once from the brain entrypoint.
+Child processes can import this module without any network side effects.
 """
 import json
+import os
 import time
 import zmq
 from Core.config_loader import load_config
@@ -15,35 +24,62 @@ STOP_ITERATIONS = _cfg["stop_iterations"]
 STOP_DELAY = _cfg["stop_delay"]
 STEP_PAUSE = _cfg["step_pause"]

-ctx = zmq.Context()
-sock = ctx.socket(zmq.PUB)
-sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}")
-time.sleep(0.5)
-log(f"ZMQ PUB bound on tcp://{ZMQ_HOST}:{ZMQ_PORT}", "info", "zmq")
+# Shared state. These stay None until init_zmq() is called.
+ctx: zmq.Context = None
+sock: zmq.Socket = None
+_INIT_SETTLE = 0.5  # seconds to let PUB tell subscribers it's alive
+
+
+def init_zmq() -> zmq.Socket:
+    """
+    Bind the PUB socket. Idempotent — safe to call more than once.
+    Call from the main (parent) process only. Do NOT call from multiprocessing
+    children — they inherit nothing useful from the bound socket anyway.
+    """
+    global ctx, sock
+    if sock is not None:
+        return sock
+    ctx = zmq.Context()
+    sock = ctx.socket(zmq.PUB)
+    sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}")
+    time.sleep(_INIT_SETTLE)
+    log(f"ZMQ PUB bound on tcp://{ZMQ_HOST}:{ZMQ_PORT} (pid={os.getpid()})",
+        "info", "zmq")
+    return sock
+
+
+def _ensure_sock() -> zmq.Socket:
+    if sock is None:
+        raise RuntimeError(
+            "zmq_api not initialized — call init_zmq() from the brain "
+            "entrypoint before using send_vel/send_cmd/gradual_stop"
+        )
+    return sock


 def get_socket():
     """Return the shared ZMQ PUB socket (for odometry to reuse)."""
-    return sock
+    return _ensure_sock()


 def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0):
     """Send velocity to Holosoma. vx m/s | vy m/s | vyaw rad/s"""
-    sock.send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))
+    _ensure_sock().send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))


 def gradual_stop():
     """Smooth deceleration to zero over ~1 second."""
+    s = _ensure_sock()
     for _ in range(STOP_ITERATIONS):
-        send_vel(0.0, 0.0, 0.0)
+        s.send_string(json.dumps({"vel": {"vx": 0.0, "vy": 0.0, "vyaw": 0.0}}))
         time.sleep(STOP_DELAY)


 def send_cmd(cmd: str):
     """Send Holosoma state command: start | walk | stand | stop"""
-    sock.send_string(json.dumps({"cmd": cmd}))
+    _ensure_sock().send_string(json.dumps({"cmd": cmd}))


-# Load MOVE_MAP from navigation config
+# Load MOVE_MAP from navigation config (pure data, safe at import time)
 _nav = load_config("Navigation")
 MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()}

Autonomous/marcus_autonomous.py
@@ -292,7 +292,10 @@ class AutonomousMode:
                self._enabled = False
                break

-           time.sleep(YOLO_CHECK_INTERVAL)
+           # No trailing sleep — _move_forward() takes FORWARD_DURATION,
+           # _turn() takes TURN_DURATION, and LLaVA assessment is ~1-2s.
+           # The body always consumes real wall time, so an extra sleep here
+           # would be pure dead time.

        # Clean up
        self._gradual_stop()

Brain/marcus_brain.py
@@ -17,7 +17,7 @@ PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 if PROJECT_DIR not in sys.path:
     sys.path.insert(0, PROJECT_DIR)

-from API.zmq_api import send_vel, gradual_stop, send_cmd
+from API.zmq_api import init_zmq, send_vel, gradual_stop, send_cmd
 from API.camera_api import start_camera, stop_camera, get_frame
 from API.yolo_api import (
     init_yolo, yolo_summary, yolo_fps,
@@ -70,7 +70,19 @@ _NAT_GOAL_RE = re.compile(
 # ══════════════════════════════════════════════════════════════════════════════

 def init_brain():
-    """Initialize all subsystems. Call once at startup."""
+    """Initialize all subsystems. Call once at startup from the parent process.
+
+    Optional subsystems (lidar / voice / imgsearch / autonomous) are gated on
+    `config_Brain.json::subsystems.<name>`. Disabling the ones you don't need
+    brings Marcus's boot time down from ~18 s to ~5-7 s.
+    """
+    subsys = _cfg.get("subsystems", {}) or {}
+
+    # Bind the ZMQ PUB socket before anything tries to publish on it.
+    # This is now explicit (previously it happened as an import side effect,
+    # which crashed every multiprocessing child that re-imported zmq_api).
+    init_zmq()
+
     raw_frame, raw_lock = start_camera()
     init_yolo(raw_frame, raw_lock)
@@ -79,53 +91,65 @@ def init_brain():

     init_memory()

-    # LiDAR (optional — continues without it)
-    try:
-        from API.lidar_api import init_lidar
-        init_lidar()
-    except Exception as e:
-        print(f" [LiDAR] Init failed: {e} — continuing without LiDAR")
+    # LiDAR — optional
+    if subsys.get("lidar", True):
+        try:
+            from API.lidar_api import init_lidar
+            init_lidar()
+        except Exception as e:
+            print(f" [LiDAR] Init failed: {e} — continuing without LiDAR")
+    else:
+        print(" [LiDAR] disabled by config")

-    init_imgsearch(
-        get_frame_fn=get_frame,
-        send_vel_fn=send_vel,
-        gradual_stop_fn=gradual_stop,
-        llava_fn=call_llava,
-        yolo_sees_fn=yolo_sees,
-        model=OLLAMA_MODEL,
-    )
+    # Image search — optional
+    if subsys.get("imgsearch", False):
+        init_imgsearch(
+            get_frame_fn=get_frame,
+            send_vel_fn=send_vel,
+            gradual_stop_fn=gradual_stop,
+            llava_fn=call_llava,
+            yolo_sees_fn=yolo_sees,
+            model=OLLAMA_MODEL,
+        )
+    else:
+        print(" [ImgSearch] disabled by config")

-    # Autonomous exploration mode
-    from API.memory_api import mem as _mem_ref
-    from API.llava_api import PATROL_PROMPT
-    auto = AutonomousMode(
-        get_frame_fn=get_frame,
-        send_vel_fn=send_vel,
-        gradual_stop_fn=gradual_stop,
-        yolo_sees_fn=yolo_sees,
-        yolo_summary_fn=yolo_summary,
-        yolo_all_classes_fn=yolo_all_classes,
-        yolo_closest_fn=yolo_closest,
-        odom_fn=lambda: {"x": 0, "y": 0, "heading": 0},  # fallback if no odom
-        call_llava_fn=call_llava,
-        patrol_prompt=PATROL_PROMPT,
-        mem=_mem_ref,
-    )
-    # Wire odometry if available
-    from API.odometry_api import odom as _odom_ref, ODOM_AVAILABLE
-    if _odom_ref and ODOM_AVAILABLE:
-        auto._odom_pos = lambda: {
-            "x": _odom_ref._x, "y": _odom_ref._y, "heading": _odom_ref._heading
-        }
-    init_autonomous(auto)
+    # Autonomous exploration mode — optional
+    if subsys.get("autonomous", True):
+        from API.memory_api import mem as _mem_ref
+        from API.llava_api import PATROL_PROMPT
+        auto = AutonomousMode(
+            get_frame_fn=get_frame,
+            send_vel_fn=send_vel,
+            gradual_stop_fn=gradual_stop,
+            yolo_sees_fn=yolo_sees,
+            yolo_summary_fn=yolo_summary,
+            yolo_all_classes_fn=yolo_all_classes,
+            yolo_closest_fn=yolo_closest,
+            odom_fn=lambda: {"x": 0, "y": 0, "heading": 0},
+            call_llava_fn=call_llava,
+            patrol_prompt=PATROL_PROMPT,
+            mem=_mem_ref,
+        )
+        from API.odometry_api import odom as _odom_ref, ODOM_AVAILABLE
+        if _odom_ref and ODOM_AVAILABLE:
+            auto._odom_pos = lambda: {
+                "x": _odom_ref._x, "y": _odom_ref._y, "heading": _odom_ref._heading
+            }
+        init_autonomous(auto)
+    else:
+        print(" [Autonomous] disabled by config")

     send_cmd("start")
     time.sleep(0.5)
     send_cmd("walk")
     time.sleep(0.5)

-    # Voice module (optional — continues without it)
-    _init_voice()
+    # Voice module — optional
+    if subsys.get("voice", True):
+        _init_voice()
+    else:
+        print(" [Voice] disabled by config")

     _log("Brain initialized", "info", "brain")
     _warmup_llava()
@@ -137,44 +161,37 @@ _voice_module = None


 def _init_voice():
-    """Initialize voice module — runs in background, calls process_command on speech."""
+    """
+    Initialize the voice subsystem: G1 built-in mic + Whisper STT + G1
+    built-in TtsMaker for replies. Every transcribed command flows through
+    process_command(), and the resulting `speak` string is sent to the G1
+    speaker.
+    """
     global _audio_api, _voice_module
     try:
         from API.audio_api import AudioAPI
-        from Voice.marcus_gemini_voice import GeminiVoiceModule as VoiceModule
+        from Voice.marcus_voice import VoiceModule

         _audio_api = AudioAPI()

-        def _voice_callback(text, role):
-            """Gemini voice callback."""
-            pass  # handled below
-            if role != "user" or not text.strip():
-                return
-            t = text.strip().lower()
-            act_kw = ["turn","move","go","walk","step","stop","come","wave","clap",
-                      "high five","shake","hug","forward","backward","left","right",
-                      "what do you see","what can you see","look","describe","patrol",
-                      "دور","امشي","روح","تقدم","ارجع","وقف","قف","تعال",
-                      "يمين","يسار","قدام","ورا","لوح","صفق","سلم",
-                      "شو شايف","شو تشوف","ماذا ترى","شو قدامك","لف","خطوات"]
-            if any(kw in t for kw in act_kw):
-                print(f" [Brain] Action: {text.strip()}")
-                try:
-                    result = process_command(text.strip())
-                    if isinstance(result, dict):
-                        sp = result.get("speak", "")
-                        vis_kw = ["see","look","describe","شايف","تشوف","ترى","قدامك"]
-                        if any(k in t for k in vis_kw) and sp and _audio_api:
-                            print(f" [Brain] Vision: {sp}")
-                            _audio_api.speak(sp)
-                except Exception as e:
-                    print(f" [Brain] Error: {e}")
-            else:
-                print(f" [Chat] {text.strip()}")
+        def _on_command(text, lang):
+            text = (text or "").strip()
+            if not text:
+                return
+            print(f" [Voice] {text}")
+            try:
+                result = process_command(text)
+            except Exception as e:
+                print(f" [Brain] Error processing voice command: {e}")
+                return
+            if isinstance(result, dict):
+                sp = (result.get("speak") or "").strip()
+                if sp and _audio_api:
+                    _audio_api.speak(sp)

-        _voice_module = VoiceModule(_audio_api, on_transcript=_voice_callback)
+        _voice_module = VoiceModule(_audio_api, on_command=_on_command)
         _voice_module.start()
-        print(f" [Voice] Always listening (Gemini voice)")
+        print(" [Voice] Always listening (Whisper + G1 mic + TtsMaker)")
     except Exception as e:
         print(f" [Voice] Init failed: {e} — continuing without voice")
         _audio_api = None
@@ -255,7 +272,7 @@ def process_command(cmd: str) -> dict:

     # ── Greeting ─────────────────────────────────────────────────────────
     if re.match(r"^(?:hi+|hey+|hello+|hola|salam|marhaba|sup|yo+|ahlan|السلام عليكم|مرحبا|أهلا|هلا|يا هلا)\s*[!.]*$", cmd, re.IGNORECASE):
-        response = "Hello! I am Marcus. How can I help you?"
+        response = "Hello! I am Sanad. How can I help you?"
         print(f"Marcus: {response}")
         add_to_history(cmd, response)
         log_cmd(cmd, response)
@@ -346,10 +363,15 @@ def _handle_llava(cmd):
     t0 = time.time()
     img = get_frame()

+    # Poll up to 500 ms in 50 ms slices instead of blocking a full second.
+    # Returns the moment a frame is available — most drops recover in <100 ms.
     if img is None:
         print(" Waiting for camera...")
-        time.sleep(1.0)
-        img = get_frame()
+        for _ in range(10):
+            time.sleep(0.05)
+            img = get_frame()
+            if img is not None:
+                break

     if img is None:
         print(" Camera not ready — command cancelled")
@@ -461,7 +483,7 @@ def run_terminal():
     status = get_brain_status()
     print()
     print("=" * 48)
-    print(" MARCUS AI BRAIN — READY")
+    print(" SANAD AI BRAIN — READY")
     print("=" * 48)
     for k, v in status.items():
         print(f" {k:<10}: {v}")

Config/config_Brain.json
@@ -3,13 +3,19 @@
   "max_history": 6,
   "num_batch": 128,
   "num_ctx": 2048,
-  "num_predict_main": 200,
+  "subsystems": {
+    "lidar": true,
+    "voice": true,
+    "imgsearch": false,
+    "autonomous": true
+  },
+  "num_predict_main": 120,
   "num_predict_goal": 80,
   "num_predict_patrol": 100,
   "num_predict_talk": 80,
   "num_predict_verify": 10,
   "warmup_num_predict": 5,
-  "main_prompt": "You are Marcus, a humanoid robot. Look at the image and follow the command.\n{facts}\n\nCommand: \"{command}\"\n\nReply with ONLY this JSON — no markdown, no explanation:\n{{\"actions\":[{{\"move\":\"forward|backward|left|right|stop\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"one sentence\",\"abort\":null}}\n\nRULES:\n- actions is a list of movement steps, max duration 5.0s each\n- move: \"forward\" \"backward\" \"left\" \"right\" \"stop\"\n- arm: \"wave\" \"raise_right\" \"raise_left\" \"clap\" \"high_five\" \"hug\" \"heart\" \"shake_hand\" \"face_wave\" or null\n- arm is NEVER a move value\n- questions/descriptions: actions=[]\n- obstacle < 0.5m: abort = \"obstacle detected\"\n- \"90 degrees\" = 5.0s | \"45 degrees\" = 2.5s | \"1 step\" = 1.0s\n\nEXAMPLES:\n\"turn right\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Turning right\",\"abort\":null}}\n\"turn right 90 degrees\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":5.0}}],\"arm\":null,\"speak\":\"Turning 90 degrees\",\"abort\":null}}\n\"move back then left\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}},{{\"move\":\"left\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving back then left\",\"abort\":null}}\n\"wave\" -> {{\"actions\":[],\"arm\":\"wave\",\"speak\":\"Waving\",\"abort\":null}}\n\"raise your right arm\" -> {{\"actions\":[],\"arm\":\"raise_right\",\"speak\":\"Raising right arm\",\"abort\":null}}\n\"walk forward and wave\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":\"wave\",\"speak\":\"Walking and waving\",\"abort\":null}}\n\"what do you see\" -> {{\"actions\":[],\"arm\":null,\"speak\":\"I see...\",\"abort\":null}}\n\"stop\" -> {{\"actions\":[{{\"move\":\"stop\",\"duration\":0}}],\"arm\":null,\"speak\":\"Stopping\",\"abort\":null}}\n\"come to me\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming to you\",\"abort\":null}}\n\"come back\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming back\",\"abort\":null}}\n\"come here\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming\",\"abort\":null}}\n\"get closer\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":1.0}}],\"arm\":null,\"speak\":\"Moving closer\",\"abort\":null}}\n\"go away\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving away\",\"abort\":null}}\n\nCommand: \"{command}\"\nJSON:",
-  "goal_prompt": "You are Marcus navigating toward a goal.\n\nGOAL: \"{goal}\"\n\nLook at the image. Have you reached the goal?\n\nReply ONLY this JSON:\n{{\"reached\":false,\"next_move\":\"left\",\"duration\":0.5,\"speak\":\"what you see\"}}\n\nRULES:\n- reached: true ONLY if you clearly see the goal target right now\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.3 to 0.8 seconds\n- Default next_move: \"left\" to keep scanning\n\nGOAL: \"{goal}\"\nJSON:",
-  "patrol_prompt": "You are Marcus, an HSE inspection robot on autonomous patrol.\n\nLook at the camera and assess the scene.\n\nReply ONLY this JSON:\n{{\"observation\":\"one sentence\",\"alert\":null,\"next_move\":\"forward\",\"duration\":1.0}}\n\nRULES:\n- alert = null if safe\n- alert = \"PPE: no helmet\" if person without helmet\n- alert = \"PPE: no vest\" if person without safety vest\n- alert = \"Hazard: description\" for other hazards\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.5 to 2.0s\n\nJSON:"
+  "main_prompt": "You are Sanad, a humanoid robot. Look at the image and follow the command.\n{facts}\n\nCommand: \"{command}\"\n\nReply with ONLY this JSON — no markdown, no explanation:\n{{\"actions\":[{{\"move\":\"forward|backward|left|right|stop\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"one sentence\",\"abort\":null}}\n\nRULES:\n- actions is a list of movement steps, max duration 5.0s each\n- move: \"forward\" \"backward\" \"left\" \"right\" \"stop\"\n- arm: \"wave\" \"raise_right\" \"raise_left\" \"clap\" \"high_five\" \"hug\" \"heart\" \"shake_hand\" \"face_wave\" or null\n- arm is NEVER a move value\n- questions/descriptions: actions=[]\n- obstacle < 0.5m: abort = \"obstacle detected\"\n- \"90 degrees\" = 5.0s | \"45 degrees\" = 2.5s | \"1 step\" = 1.0s\n\nEXAMPLES:\n\"turn right\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Turning right\",\"abort\":null}}\n\"turn right 90 degrees\" -> {{\"actions\":[{{\"move\":\"right\",\"duration\":5.0}}],\"arm\":null,\"speak\":\"Turning 90 degrees\",\"abort\":null}}\n\"move back then left\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}},{{\"move\":\"left\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving back then left\",\"abort\":null}}\n\"wave\" -> {{\"actions\":[],\"arm\":\"wave\",\"speak\":\"Waving\",\"abort\":null}}\n\"raise your right arm\" -> {{\"actions\":[],\"arm\":\"raise_right\",\"speak\":\"Raising right arm\",\"abort\":null}}\n\"walk forward and wave\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":\"wave\",\"speak\":\"Walking and waving\",\"abort\":null}}\n\"what do you see\" -> {{\"actions\":[],\"arm\":null,\"speak\":\"I see...\",\"abort\":null}}\n\"stop\" -> {{\"actions\":[{{\"move\":\"stop\",\"duration\":0}}],\"arm\":null,\"speak\":\"Stopping\",\"abort\":null}}\n\"come to me\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming to you\",\"abort\":null}}\n\"come back\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming back\",\"abort\":null}}\n\"come here\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Coming\",\"abort\":null}}\n\"get closer\" -> {{\"actions\":[{{\"move\":\"forward\",\"duration\":1.0}}],\"arm\":null,\"speak\":\"Moving closer\",\"abort\":null}}\n\"go away\" -> {{\"actions\":[{{\"move\":\"backward\",\"duration\":2.0}}],\"arm\":null,\"speak\":\"Moving away\",\"abort\":null}}\n\nCommand: \"{command}\"\nJSON:",
+  "goal_prompt": "You are Sanad navigating toward a goal.\n\nGOAL: \"{goal}\"\n\nLook at the image. Have you reached the goal?\n\nReply ONLY this JSON:\n{{\"reached\":false,\"next_move\":\"left\",\"duration\":0.5,\"speak\":\"what you see\"}}\n\nRULES:\n- reached: true ONLY if you clearly see the goal target right now\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.3 to 0.8 seconds\n- Default next_move: \"left\" to keep scanning\n\nGOAL: \"{goal}\"\nJSON:",
+  "patrol_prompt": "You are Sanad, an HSE inspection robot on autonomous patrol.\n\nLook at the camera and assess the scene.\n\nReply ONLY this JSON:\n{{\"observation\":\"one sentence\",\"alert\":null,\"next_move\":\"forward\",\"duration\":1.0}}\n\nRULES:\n- alert = null if safe\n- alert = \"PPE: no helmet\" if person without helmet\n- alert = \"PPE: no vest\" if person without safety vest\n- alert = \"Hazard: description\" for other hazards\n- next_move: \"forward\" \"left\" \"right\"\n- duration: 0.5 to 2.0s\n\nJSON:"
 }

Config/config_Voice.json
@@ -1,55 +1,46 @@
 {
   "tts": {
-    "piper_voice_ar": "ar_JO-kareem-medium",
-    "piper_voice_en": "en_US-lessac-medium",
-    "piper_sample_rate": 22050,
-    "target_sample_rate": 16000,
-    "piper_timeout_sec": 120,
-    "en_backend": "edge_tts",
-    "ar_backend": "edge_tts",
-    "edge_voice_ar": "ar-AE-HamdanNeural",
-    "edge_voice_en": "en-US-GuyNeural"
+    "backend": "builtin_ttsmaker",
+    "builtin_speaker_id": 0,
+    "target_sample_rate": 16000
   },
   "stt": {
     "wake_model": "tiny",
     "command_model": "small",
-    "wake_words_en": [
-      "marcus",
-      "marcos",
-      "markus"
-    ],
-    "wake_words_ar": [
-      "ماركوس",
-      "مارکوس",
-      "ماركس"
-    ],
-    "language": null,
+    "wake_words_en": ["sanad", "sannad", "sanat", "sunnat"],
+    "language": "en",
     "command_timeout_sec": 10,
     "silence_threshold": 500,
     "silence_duration_sec": 1.5,
     "max_record_sec": 15
   },
   "mic": {
+    "backend": "builtin_udp",
     "source_index": "3",
     "format": "s16le",
     "rate": 16000,
     "channels": 1
   },
+  "mic_udp": {
+    "group": "239.168.123.161",
+    "port": 5555,
+    "buffer_max_bytes": 64000,
+    "read_timeout_sec": 0.04
+  },
   "speaker": {
     "dds_interface": "eth0",
     "volume": 100,
-    "app_name": "marcus"
+    "app_name": "sanad"
   },
   "audio": {
     "data_dir": "Data/Voice/Recordings",
     "log_file": "logs/voice.log"
   },
   "messages": {
-    "wake_heard": "Listening...",
-    "no_speech": "I didn't catch that. Say my name again.",
+    "wake_heard": "Listening",
+    "no_speech": "I didn't catch that, please say it again",
     "error_tts": "Speech synthesis failed",
     "error_mic": "Microphone error",
     "ready": "Voice system ready"
   }
 }
@@ -26,7 +26,7 @@
 # ══════════════════════════════════════════════════════════════════════════════

 main_prompt: |
-  You are Marcus — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai.
+  You are Sanad — an advanced humanoid robot assistant built by YS Lootah Technology, Dubai.
   You are physically present in the room. You have a body, arms, and a camera.
   You follow commands from your operator and respond intelligently.
   {facts}
@@ -184,7 +184,7 @@ main_prompt: |
   → {{"actions":[],"arm":null,"speak":"Nice to meet you Kassam, I will remember that","abort":null}}

   "who are you"
-  → {{"actions":[],"arm":null,"speak":"I am Marcus, a humanoid robot assistant by YS Lootah Technology","abort":null}}
+  → {{"actions":[],"arm":null,"speak":"I am Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}

   Safety:
   "walk into the wall"
@@ -307,7 +307,7 @@ main_prompt: |
 # ══════════════════════════════════════════════════════════════════════════════

 goal_prompt: |
-  You are Marcus, a humanoid robot actively navigating toward a specific target.
+  You are Sanad, a humanoid robot actively navigating toward a specific target.

   YOUR MISSION: "{goal}"

@@ -392,7 +392,7 @@ goal_prompt: |
 # ══════════════════════════════════════════════════════════════════════════════

 patrol_prompt: |
-  You are Marcus, a humanoid robot autonomously exploring and mapping an office environment.
+  You are Sanad, a humanoid robot autonomously exploring and mapping an office environment.

   Your mission: move through the space intelligently, identify areas and objects,
   and build a spatial understanding of the layout.
@@ -463,7 +463,7 @@ patrol_prompt: |
 # ══════════════════════════════════════════════════════════════════════════════

 talk_prompt: |
-  You are Marcus, a humanoid robot assistant. You have been asked a question
+  You are Sanad, a humanoid robot assistant. You have been asked a question
   or given information. Do NOT move — just respond intelligently.
   {facts}

@@ -509,7 +509,7 @@ talk_prompt: |
   → {{"actions":[],"arm":null,"speak":"All systems are functioning normally and I am ready to assist you","abort":null}}

   "what is your name"
-  → {{"actions":[],"arm":null,"speak":"My name is Marcus, a humanoid robot assistant by YS Lootah Technology","abort":null}}
+  → {{"actions":[],"arm":null,"speak":"My name is Sanad, a humanoid robot assistant by YS Lootah Technology","abort":null}}

   "who built you"
   → {{"actions":[],"arm":null,"speak":"I was built by YS Lootah Technology in Dubai, and my programmer is Kassam","abort":null}}

Core/logger.py
@@ -1,9 +1,13 @@
 """
-logger.py — Project-wide logging via Logger.py
+logger.py — Project-wide configured logging instance.
+
+Imports the `Logs` backend class from log_backend.py (formerly Logger.py;
+renamed to avoid a case-only filename collision with this module, which
+breaks any case-insensitive filesystem — macOS default HFS+/APFS, Windows).
 """
 import os
 from Core.env_loader import PROJECT_ROOT
-from Core.Logger import Logs
+from Core.log_backend import Logs

 # Single shared instance — all modules use this
 _logs = Logs(main_log_file=os.path.join(PROJECT_ROOT, "logs", "main.log"))
@@ -1,8 +1,37 @@
 # Marcus — Full API & Developer Reference

 **Project:** Marcus | YS Lootah Technology | Jetson Orin NX + G1 EDU
-**Scripts:** `~/Models_marcus/marcus_llava.py` + `~/Models_marcus/marcus_yolo.py`
-**Updated:** April 4, 2026
+**Robot persona:** Sanad (wake word + self-intro; project code stays under `Marcus/`)
+**Entry points:** `run_marcus.py` (terminal) / `Server/marcus_server.py` (WebSocket)
+**Updated:** 2026-04-21
+
+> **What changed since the early draft (April 4):** The project was restructured
+> from two monolithic scripts (`marcus_llava.py` + `marcus_yolo.py`) into a
+> layered architecture. See `Doc/architecture.md` for the current file tree and
+> `Doc/environment.md` for the verified Jetson software stack, exact library
+> versions, and GPU bring-up recipe. This reference still describes the
+> function-level semantics (inputs/outputs/examples) — treat any file path in
+> this document as illustrative and cross-check the actual module. Recent
+> deltas are called out inline below.
+
+### Recent API deltas (2026-04-21)
+
+| Change | Location | Note |
+|---|---|---|
+| GPU is mandatory for YOLO | `Config/config_Vision.json`, `Vision/marcus_yolo.py` | `yolo_device` defaults to `"cuda"` and is enforced; `_resolve_device()` raises `RuntimeError` on missing CUDA. `yolo_half=true` runs FP16 on Orin (capability 8.7). |
+| Ollama model | `Config/config_Brain.json` | Default `ollama_model` is `qwen2.5vl:3b` (not `llava:7b`). |
+| Ollama compute-graph caps | `Config/config_Brain.json` | `num_batch=128`, `num_ctx=2048` — required on 16 GB Orin NX to prevent the llama runner OOM. Propagated by `API/llava_api.py` and `Vision/marcus_imgsearch.py` to every `ollama.chat` call. |
+| `num_predict_main` lowered | `Config/config_Brain.json` | 200 → 120 (shaves ~400–600 ms per open-ended command; JSON still parses). |
+| ZMQ bind moved out of import | `API/zmq_api.py` | `init_zmq()` must be called from the main process before any `send_vel/send_cmd`. `init_brain()` does this. Children spawned via `multiprocessing` no longer collide on port 5556. |
+| Camera-retry poll | `Brain/marcus_brain.py::_handle_llava` | Replaced `time.sleep(1.0)` with 10×50 ms polls. |
+| Conditional scan sleeps | `Navigation/goal_nav.py`, `Autonomous/marcus_autonomous.py` | Removed unconditional per-step naps when real work (YOLO hit, LLaVA call, forward move) already consumed wall time. |
+| Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. |
+| Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. |
+| Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. |
+| Gemini voice deleted | `Voice/marcus_gemini_voice.py` removed | `_init_voice()` now spawns `Voice.marcus_voice.VoiceModule` (Whisper wake + command STT). No more WebSocket, no more asyncio event loop, no API key. |
+| Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. |
+| Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. |
+| Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. |
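Two of the rows above (the compute-graph caps and `num_predict_main`) only take effect if every Ollama call forwards them. A minimal sketch of that propagation, assuming the standard `ollama` Python client — the actual call sites in `API/llava_api.py` and `Vision/marcus_imgsearch.py` are not shown in this diff, so the constant and function names here are illustrative:

```python
import ollama

# Mirrors Config/config_Brain.json; hypothetical constant name.
GRAPH_CAPS = {"num_batch": 128, "num_ctx": 2048, "num_predict": 120}

def ask_vl(prompt: str, image_b64: str) -> str:
    resp = ollama.chat(
        model="qwen2.5vl:3b",
        messages=[{"role": "user", "content": prompt, "images": [image_b64]}],
        # Without these caps the llama runner OOMs on the 16 GB Orin NX.
        options=GRAPH_CAPS,
    )
    return resp["message"]["content"]
```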

 ---

@@ -22,38 +51,54 @@
 12. [JSON Schema Reference](#12-json-schema-reference)
 13. [Environment & Paths](#13-environment--paths)
 14. [Quick Reference Card](#14-quick-reference-card)
+15. [Voice API (mic + TTS + STT)](#15-voice-api-mic--tts--stt)

 ---

 ## 1. Configuration Variables

-Defined at the top of `marcus_llava.py`. Edit here to change global behavior.
+All configuration is now **JSON-driven** and lives under `Config/`. Each module
+loads its config at startup via `Core.config_loader.load_config(name)`.

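A short sketch of the loading pattern — assuming `load_config(name)` resolves to `Config/config_<name>.json` and returns a plain dict, which is consistent with `load_config("Navigation")["move_map"]` in `API/zmq_api.py`:

```python
from Core.config_loader import load_config

_cfg = load_config("Brain")           # assumed to read Config/config_Brain.json
model = _cfg["ollama_model"]          # "qwen2.5vl:3b"
subsys = _cfg.get("subsystems", {})   # missing block -> module defaults apply
```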
-| Variable | Default | Description |
-|----------|---------|-------------|
-| `ZMQ_HOST` | `"127.0.0.1"` | Holosoma ZMQ host |
-| `ZMQ_PORT` | `5556` | Holosoma ZMQ port |
-| `ZMQ_YOLO_PORT` | `5557` | YOLO ZMQ port (standalone mode) |
-| `OLLAMA_MODEL` | `"llava:7b"` | LLaVA model via Ollama |
-| `CAM_WIDTH` | `424` | Camera capture width (px) |
-| `CAM_HEIGHT` | `240` | Camera capture height (px) |
-| `CAM_FPS` | `15` | Camera frame rate |
-| `CAM_QUALITY` | `70` | JPEG quality sent to LLaVA |
-| `STOP_ITERATIONS` | `20` | gradual_stop message count |
-| `STOP_DELAY` | `0.05` | seconds between stop messages |
-| `STEP_PAUSE` | `0.3` | pause between consecutive action steps |
-| `ARM_SDK_PATH` | `/home/unitree/unitree_sdk2_python` | Arm SDK path |
-| `ARM_INTERFACE` | `"eth0"` | Network interface for arm SDK |
-
-Defined at top of `marcus_yolo.py`:
+**`Config/config_ZMQ.json`** (Holosoma bridge)
+
+| Key | Default | Description |
+|---|---|---|
+| `zmq_host` | `"127.0.0.1"` | Holosoma ZMQ host |
+| `zmq_port` | `5556` | Holosoma ZMQ port |
+| `stop_iterations` | `20` | `gradual_stop()` message count |
+| `stop_delay` | `0.05` | seconds between stop messages |
+| `step_pause` | `0.3` | pause between consecutive action steps |

-| Variable | Default | Description |
-|----------|---------|-------------|
-| `YOLO_MODEL_PATH` | `.../Model/yolov8m.pt` | YOLO model path |
-| `YOLO_CONFIDENCE` | `0.45` | Minimum detection confidence |
-| `YOLO_IOU` | `0.45` | NMS IOU threshold |
-| `YOLO_DEVICE` | `"cpu"` | Inference device ("cpu" or "cuda") |
-| `YOLO_IMG_SIZE` | `320` | Inference image size (smaller = faster) |
+**`Config/config_Brain.json`** (Ollama VL model)
+
+| Key | Default | Description |
+|---|---|---|
+| `ollama_model` | `"qwen2.5vl:3b"` | Ollama model tag |
+| `max_history` | `6` | conversation turns retained |
+| `num_batch` | `128` | llama.cpp batch — **cap, required for Jetson** |
+| `num_ctx` | `2048` | llama.cpp KV context length — **cap, required for Jetson** |
+| `num_predict_main` | `120` | max tokens for the main command path |
+| `num_predict_goal` | `80` | goal-navigation call |
+| `num_predict_patrol` | `100` | autonomous patrol call |
+| `num_predict_talk` | `80` | talk-only path |
+| `num_predict_verify` | `10` | YOLO condition verifier (`yes`/`no`) |
+
+**`Config/config_Vision.json`** (YOLO)
+
+| Key | Default | Description |
+|---|---|---|
+| `yolo_model_path` | `"Models/yolov8m.pt"` | weights file (auto-fetched if missing) |
+| `yolo_confidence` | `0.45` | detection confidence threshold |
+| `yolo_iou` | `0.45` | NMS IOU threshold |
+| `yolo_device` | `"cuda"` | **GPU required** — `"cpu"` raises `RuntimeError` |
+| `yolo_half` | `true` | FP16 inference (Ampere tensor cores) |
+| `yolo_img_size` | `320` | inference image size |
+| `tracked_classes` | 19 COCO classes | filter for relevant detections |
+
+**`Config/config_Camera.json`**: `424x240 @ 15 fps`, `JPEG quality 70`.
+**`Config/config_Voice.json`**: see section 6 below.
+**`Config/config_Network.json`**: Jetson eth0/wlan0 IPs, WebSocket port.

 ---

@@ -61,20 +106,28 @@ Defined at top of `marcus_yolo.py`:

 ### Setup

+The bind is no longer an import-time side effect. It runs inside `init_zmq()`, called once by `init_brain()` from the main process. Children (e.g. the LiDAR SLAM worker spawned via `multiprocessing.spawn`) can re-import `API.zmq_api` without rebinding.
+
 ```python
-ctx = zmq.Context()
-sock = ctx.socket(zmq.PUB)
-sock.bind("tcp://127.0.0.1:5556")
-time.sleep(0.5)
+# API/zmq_api.py — bind happens here, not at module import
+def init_zmq() -> zmq.Socket:
+    global ctx, sock
+    if sock is not None:
+        return sock          # idempotent
+    ctx = zmq.Context()
+    sock = ctx.socket(zmq.PUB)
+    sock.bind(f"tcp://{ZMQ_HOST}:{ZMQ_PORT}")
+    time.sleep(0.5)          # let SUBs attach
+    return sock
 ```
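To make the call order concrete, a usage sketch — the names are the module's own, and the port value comes from `Config/config_ZMQ.json`:

```python
# Parent process only (run_marcus.py -> init_brain()):
from API.zmq_api import init_zmq, send_vel

init_zmq()          # binds tcp://127.0.0.1:5556 exactly once per process
send_vel(vx=0.2)    # safe now; raises RuntimeError if init_zmq() was skipped

# multiprocessing children may re-import API.zmq_api freely — the module
# no longer binds at import time, so no "Address already in use" crash.
```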
 ### `send_vel(vx, vy, vyaw)`

-Send velocity command to Holosoma.
+Send velocity command to Holosoma. Raises `RuntimeError` if `init_zmq()` wasn't called.

 ```python
 def send_vel(vx: float = 0.0, vy: float = 0.0, vyaw: float = 0.0):
-    sock.send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))
+    _ensure_sock().send_string(json.dumps({"vel": {"vx": vx, "vy": vy, "vyaw": vyaw}}))
 ```

 | Parameter | Unit | Safe range | Effect |
@@ -661,14 +714,17 @@ from unitree_sdk2py.g1.arm.g1_arm_action_client import G1ArmActionClient  # Arm

 ```
 STARTUP:
-  Tab 1:  source ~/.holosoma_deps/miniconda3/bin/activate hsinference
-          cd ~/holosoma && sudo jetson_clocks
+  Tab 1 (hsinference env): Holosoma locomotion policy
           python3 run_policy.py inference:g1-29dof-loco \
             --task.velocity-input zmq --task.state-input zmq --task.interface eth0

-  Tab 2:  ollama serve &
-          /home/unitree/miniconda3/envs/marcus/bin/python3 ~/Models_marcus/marcus_llava.py
-          (YOLO starts automatically — no Tab 3 needed)
+  Tab 2:  ollama serve > /tmp/ollama.log 2>&1 &
+          sleep 3
+
+  Tab 3 (marcus env): conda activate marcus && cd ~/Marcus && python3 run_marcus.py
+          (YOLO + voice + LiDAR all start automatically per subsystems flags)
+
+  WAKE WORD: "Sanad"

 COMMANDS:
   walk forward · turn right · turn left · move back
@@ -704,4 +760,74 @@ SAFETY:

 ---

+## 15. Voice API (mic + TTS + STT)
+
+New pipeline as of 2026-04-21. Replaces the Gemini live WebSocket + edge-tts/Piper stack.
+
+### Mic — `Voice.builtin_mic.BuiltinMic`
+
+Captures the G1's on-board array microphone over UDP multicast. No USB mic required. 16 kHz mono int16 PCM natively; no resampling needed.
+
+```python
+from Voice.builtin_mic import BuiltinMic
+mic = BuiltinMic(group="239.168.123.161", port=5555, buf_max=64_000)
+mic.start()
+try:
+    pcm = mic.read_chunk(1024)    # 512 samples, ~32 ms, int16 mono
+    # or
+    pcm = mic.read_seconds(3.0)
+finally:
+    mic.stop()
+```
+
+Config under `config_Voice.json::mic_udp`.
+
+### TTS — `Voice.builtin_tts.BuiltinTTS`
+
+Wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker`. English only — refuses non-ASCII input.
+
+```python
+from Voice.builtin_tts import BuiltinTTS
+tts = BuiltinTTS(audio_client, default_speaker_id=0)
+tts.speak("Hello, I am Sanad", block=True)    # synth + play on G1 body speaker
+```
+
+Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly.
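The non-ASCII refusal is presumably a plain `str.isascii()` gate ahead of the TtsMaker RPC — a hypothetical sketch, not the actual `Voice/builtin_tts.py` body:

```python
def _rejects(text: str) -> bool:
    # The G1 firmware silently maps Arabic to a Chinese voice, so anything
    # outside plain ASCII is refused instead of being mispronounced.
    return not text.isascii()
```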
+### Wake + command loop — `Voice.marcus_voice.VoiceModule`
+
+Four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` detects the wake word, `small` transcribes commands.
+
+```python
+from API.audio_api import AudioAPI
+from Voice.marcus_voice import VoiceModule
+
+def on_command(text, lang):
+    print(f"heard: {text}")
+
+audio = AudioAPI()
+voice = VoiceModule(audio, on_command=on_command)
+voice.start()    # background thread
+# ... later ...
+voice.stop()
+```
+
+Wake words are configured in `config_Voice.json::stt.wake_words_en`. The brain's `_init_voice()` wires `on_command` to `process_command(text)` + `audio_api.speak(reply)`.
+
+### AudioAPI — `API.audio_api.AudioAPI`
+
+Orchestration layer. Owns the `AudioClient`, manages mute/unmute, exposes a clean `speak` + `record` API.
+
+```python
+from API.audio_api import AudioAPI
+audio = AudioAPI()
+audio.speak("Hello")              # English only; non-ASCII returns early
+pcm = audio.record(seconds=5)     # int16 mono 16 kHz — uses BuiltinMic
+audio.play_pcm(pcm)               # raw PCM playback via Unitree RPC
+```
+
+Config: `config_Voice.json::tts.backend = "builtin_ttsmaker"`, `mic.backend = "builtin_udp"` (or `"pactl_parec"` to fall back to Hollyland).
+
 ---

 *Marcus — YS Lootah Technology | Kassam | April 2026*
Binary file not shown.
Binary file not shown.

Doc/architecture.md
@@ -1,20 +1,39 @@
 # Marcus — System Architecture

 **Project**: Marcus | YS Lootah Technology
-**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX (16GB)
-**Updated**: 2026-04-06
+**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX 16 GB
+**Robot persona**: **Sanad** (wake word + self-intro; project code still lives under `Marcus/`)
+**Updated**: 2026-04-21

 ---

+## Recent deltas (since 2026-04-06)
+
+- **GPU-only YOLO** — `_resolve_device()` raises `RuntimeError` if CUDA is missing. `yolo_device=cuda`, `yolo_half=true` by default.
+- **Ollama compute-graph caps** — `num_batch=128`, `num_ctx=2048` in `config_Brain.json` (otherwise llama.cpp OOMs on the 16 GB Jetson).
+- **`num_predict_main: 120`** (was 200) — saves ~400-600 ms per open-ended command.
+- **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import.
+- **G1 built-in microphone** via UDP multicast `239.168.123.161:5555` — `Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic.
+- **G1 built-in TTS** via `client.TtsMaker()` — `Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed.
+- **Gemini voice module deleted** — Whisper wake-word + command STT path is now authoritative (`Voice/marcus_voice.py`).
+- **Subsystem flags** — `config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages.
+- **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps.
+- **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo now clones cleanly on macOS/Windows.
+- **Robot persona = "Sanad"** — wake words, prompts, banner, and self-intro all use "Sanad". Project identity ("Marcus") remains in file names, class names, directory, logs.
+
See `Doc/environment.md` for the verified Jetson software stack and `Doc/pipeline.md` for the end-to-end data flow.
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Marcus is a fully offline humanoid robot AI system. The brain runs on Jetson Orin NX with no cloud dependencies. It uses vision-language models (Qwen2.5-VL via Ollama) for understanding commands, YOLO for real-time object detection, dead reckoning for position tracking, and persistent memory across sessions.
|
||||
Marcus is a mostly-offline humanoid robot AI system. The brain runs on Jetson Orin NX using a local vision-language model (Qwen2.5-VL via Ollama) for open-ended commands, YOLOv8m for real-time object detection (CUDA + FP16), dead reckoning + optional ROS2 odometry for pose, Livox Mid-360 LiDAR + a custom SLAM worker for mapping, and persistent memory across sessions.
|
||||
|
||||
Two operating modes:
|
||||
- **Terminal mode** (`run_marcus.py`) — direct keyboard control on the Jetson
|
||||
- **Server mode** (`Server/marcus_server.py`) — WebSocket server allowing remote CLI or GUI clients
|
||||
- **Terminal mode** (`run_marcus.py`) — direct keyboard control on the Jetson. Voice subsystem runs alongside by default.
|
||||
- **Server mode** (`Server/marcus_server.py`) — WebSocket server allowing remote CLI or GUI clients.
|
||||
|
||||
Both modes use the **same brain** — identical command processing, same YOLO, same memory, same movement control.
|
||||
Both modes use the **same brain** — identical command processing, same YOLO, same memory, same movement control. Voice, LiDAR, image-search and autonomous-patrol are gated behind `config_Brain.json::subsystems` flags.
|
||||
|
||||
---
|
||||
|
||||
@ -28,14 +47,14 @@ Marcus/
├── Core/ # Foundation layer — no external deps
│ ├── env_loader.py # Reads .env, resolves PROJECT_ROOT
│ ├── config_loader.py # load_config(name) → reads Config/config_{name}.json
│ ├── Logger.py # Logging engine (file-based, no console output)
│ ├── log_backend.py # Logging engine (file-based, no console output) — was Logger.py
│ └── logger.py # Project wrapper: log(), log_and_print(), get_logger()
│
├── Config/ # ALL configuration — one JSON per module
│ ├── config_ZMQ.json # ZMQ host, port, stop params
│ ├── config_Camera.json # RealSense resolution, fps, quality
│ ├── config_Brain.json # Ollama model, prompts, num_predict values
│ ├── config_Vision.json # YOLO model path, confidence, tracked classes
│ ├── config_Brain.json # Ollama model, prompts, num_predict, num_batch/ctx, subsystems
│ ├── config_Vision.json # YOLO model path, device=cuda, half=true, confidence, tracked classes
│ ├── config_Navigation.json # move_map, goal aliases, YOLO goal classes
│ ├── config_Patrol.json # patrol duration, proximity threshold
│ ├── config_Arm.json # arm actions, aliases, availability flag
@ -43,17 +62,26 @@ Marcus/
│ ├── config_Memory.json # session/places paths
│ ├── config_Network.json # Jetson IPs (eth0/wlan0), ports
│ ├── config_ImageSearch.json # search defaults
│ └── marcus_prompts.yaml # All LLaVA/Qwen prompts (main, goal, patrol, talk, verify)
│ ├── config_Voice.json # mic (builtin_udp|pactl_parec), TTS backend, wake words, mic_udp group/port
│ ├── config_LiDAR.json # Livox Mid-360 connection + SLAM engine params
│ └── marcus_prompts.yaml # All Qwen-VL prompts (main, goal, patrol, talk, verify)
│
├── API/ # Interface layer — one file per subsystem
│ ├── zmq_api.py # ZMQ PUB socket: send_vel(), gradual_stop(), send_cmd()
│ ├── zmq_api.py # ZMQ PUB socket: init_zmq(), send_vel(), gradual_stop(), send_cmd()
│ ├── camera_api.py # RealSense thread: start/stop_camera(), get_frame()
│ ├── llava_api.py # LLaVA queries: call_llava(), ask(), ask_goal(), ask_patrol()
│ ├── yolo_api.py # YOLO interface: init_yolo(), yolo_sees(), yolo_summary()...
│ ├── llava_api.py # Qwen2.5-VL queries via Ollama: call_llava(), ask(), ask_goal()…
│ ├── yolo_api.py # YOLO interface: init_yolo(), yolo_sees(), yolo_summary()…
│ ├── odometry_api.py # Odometry wrapper: init_odometry(), get_position()
│ ├── memory_api.py # Memory wrapper: init_memory(), log_cmd(), place_save/goto()
│ ├── arm_api.py # Arm gestures: do_arm(), ARM_ACTIONS, ALL_ARM_NAMES
│ └── imgsearch_api.py # Image search wrapper: init_imgsearch(), get_searcher()
│ ├── arm_api.py # Arm gestures: do_arm(), ARM_ACTIONS, ALL_ARM_NAMES (stub)
│ ├── imgsearch_api.py # Image search wrapper: init_imgsearch(), get_searcher()
│ ├── audio_api.py # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic
│ └── lidar_api.py # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status()
│
├── Voice/ # Mic + TTS + wake-word STT
│ ├── builtin_mic.py # G1 array mic via UDP multicast 239.168.123.161:5555
│ ├── builtin_tts.py # BuiltinTTS — client.TtsMaker(text, speaker_id)
│ └── marcus_voice.py # VoiceModule — Whisper tiny (wake) + small (command) state machine
│
├── Brain/ # Decision logic — imports ONLY from API/
│ ├── marcus_brain.py # Orchestrator: init_brain(), process_command(), run_terminal()
@ -127,39 +155,40 @@ Marcus/
│ Server/marcus_server.py (WebSocket) │
└──────────────────┬──────────────────────────────┘
│
┌──────────────────▼──────────────────────────────┐
│ Brain Layer │
│ marcus_brain.py — init_brain() │
│ — process_command(cmd) │
│ command_parser.py — 14 regex local commands │
│ executor.py — execute LLaVA decisions │
│ marcus_memory.py — session + place memory │
└──────────────────┬──────────────────────────────┘
┌──────────────────▼──────────────────────────────────┐
│ Brain Layer │
│ marcus_brain.py — init_brain() / process_command │
│ command_parser.py — regex-table local commands │
│ executor.py — execute Qwen-VL decisions │
│ marcus_memory.py — session + place memory │
└──────────────────┬──────────────────────────────────┘
│ imports only from API/
┌──────────────────▼──────────────────────────────┐
│ API Layer │
│ zmq_api camera_api llava_api │
│ yolo_api odometry_api memory_api │
│ arm_api imgsearch_api │
└──────────────────┬──────────────────────────────┘
│ wraps
┌──────────────────▼──────────────────────────────┐
│ Navigation / Vision │
│ goal_nav.py marcus_yolo.py │
│ patrol.py marcus_imgsearch.py │
│ marcus_odometry.py │
└──────────────────┬──────────────────────────────┘
│
┌──────────────────▼──────────────────────────────┐
│ Core Layer │
│ env_loader.py config_loader.py │
│ Logger.py logger.py │
└──────────────────┬──────────────────────────────┘
┌──────────────────▼──────────────────────────────────┐
│ API Layer │
│ zmq_api camera_api llava_api audio_api │
│ yolo_api odometry_api memory_api imgsearch_api │
│ arm_api lidar_api │
└──────────────┬───────────────────────┬──────────────┘
│ wraps │ wraps
┌──────────────▼───────────┐ ┌────────▼────────────────┐
│ Navigation / Vision │ │ Voice │
│ goal_nav.py │ │ builtin_mic.py │
│ patrol.py │ │ builtin_tts.py │
│ marcus_odometry.py │ │ marcus_voice.py │
│ marcus_yolo.py │ │ (Whisper + TtsMaker) │
│ marcus_imgsearch.py │ └──────────┬──────────────┘
└──────────────┬───────────┘ │
│ │
┌──────────────▼─────────────────────────▼────────────┐
│ Core Layer │
│ env_loader.py config_loader.py │
│ log_backend.py logger.py │
└──────────────────┬──────────────────────────────────┘
│ reads
┌──────────────────▼──────────────────────────────┐
│ Config / .env │
│ 11 JSON files + marcus_prompts.yaml │
└─────────────────────────────────────────────────┘
┌──────────────────▼──────────────────────────────────┐
│ Config / .env │
│ 13 JSON files + marcus_prompts.yaml │
└──────────────────────────────────────────────────────┘
```

**Rule**: Brain never imports from Vision/ or Navigation/ directly. It goes through the API layer.
@ -176,11 +205,11 @@ Reads `.env` from the project root to resolve `PROJECT_ROOT`. Uses a minimal bui
#### `config_loader.py` (30 lines)
`load_config(name)` reads `Config/config_{name}.json` and caches the result. All modules call this instead of hardcoding constants. Also provides `config_path(relative)` to resolve relative paths (e.g., `"Models/yolov8m.pt"`) to absolute paths from PROJECT_ROOT.
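A minimal sketch of the two helpers, assuming `PROJECT_ROOT` is exported by `Core/env_loader.py` (the real 30-line file may differ in details):

```python
import json
import os

from Core.env_loader import PROJECT_ROOT  # assumed export

_CACHE: dict = {}

def load_config(name: str) -> dict:
    """Read Config/config_{name}.json once and cache the parsed dict."""
    if name not in _CACHE:
        path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
        with open(path) as f:
            _CACHE[name] = json.load(f)
    return _CACHE[name]

def config_path(relative: str) -> str:
    """Resolve a repo-relative path like "Models/yolov8m.pt" to an absolute one."""
    return os.path.join(PROJECT_ROOT, relative)
```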

#### `Logger.py` (186 lines)
Full logging engine from AI_Photographer. File-based only (no console output by default). Creates per-module log files in `logs/`. Handles write permission fallbacks, log name normalization, and corrupt log recovery.
#### `log_backend.py` (186 lines, was `Logger.py`)
Full logging engine ported from AI_Photographer. File-based only (no console output by default). Creates per-module log files in `logs/`. Handles write permission fallbacks, log name normalization, and corrupt log recovery. Renamed from `Logger.py` on 2026-04-21 to eliminate a case-only collision with `logger.py` that prevented the repo from cloning on case-insensitive filesystems (macOS/Windows).

#### `logger.py` (51 lines)
Project wrapper around `Logger.py`. Provides:
Project wrapper around `log_backend.Logs`. Provides (usage example after the list):
- `log(message, level, module)` — write to `logs/{module}.log`
- `log_and_print(message, level, module)` — write + print
- `get_logger(module)` — get configured Logs instance
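```python
# Usage sketch, following the signatures listed above (keyword-style
# arguments are an assumption).
from Core.logger import log, log_and_print, get_logger

log("frame grabbed", level="INFO", module="camera")          # → logs/camera.log
log_and_print("YOLO ready", level="INFO", module="vision")   # log + console
logger = get_logger("brain")                                 # configured Logs instance
```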

@ -191,12 +220,15 @@ Project wrapper around `Logger.py`. Provides:

Each API file wraps one subsystem. They read their own config via `load_config()`, handle import errors gracefully with fallback stubs, and export clean public functions.
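The import-guard pattern the API files share looks roughly like this (illustrative names; each real module defines its own stubs):

```python
# Sketch: if the heavy dependency is missing, export a harmless stub so the
# brain still boots and the subsystem reports itself unavailable.
try:
    from ultralytics import YOLO
    YOLO_AVAILABLE = True
except ImportError:
    YOLO_AVAILABLE = False

    def yolo_sees(*args, **kwargs) -> bool:
        return False  # stub: vision quietly reports "nothing seen"
```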

#### `zmq_api.py` (49 lines)
Creates a ZMQ PUB socket on startup (binds to `tcp://127.0.0.1:{zmq_port}`). Holosoma's RL policy connects to this socket as SUB and receives velocity commands at 50 Hz.
#### `zmq_api.py` (~75 lines)
Holds the ZMQ PUB socket used to drive Holosoma at 50 Hz. **The bind is no longer a module import side effect** — it runs only when `init_zmq()` is called from the main (parent) process. This lets the LiDAR SLAM worker (spawned via `multiprocessing.spawn`) re-import the module without rebinding port 5556 and crashing. A sketch of the idempotent bind follows the export list.

**Exports:**
- `init_zmq()` — idempotent bind, called once by `init_brain()`
- `send_vel(vx, vy, vyaw)` — send velocity to Holosoma
- `gradual_stop()` — 20 zero-velocity messages over 1 second
- `send_cmd(cmd)` — Holosoma state machine (`start` / `walk` / `stand` / `stop`)
- `get_socket()` — access the bound socket (for odometry to reuse)
- `send_cmd(cmd)` — send state command: "start", "walk", "stand", "stop"
- `get_socket()` — return the shared PUB socket (for odometry to reuse)
- `MOVE_MAP` — direction-to-velocity lookup: `{"forward": (0.3, 0, 0), "left": (0, 0, 0.3), ...}`
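```python
# Sketch of the idempotent bind (illustrative; see API/zmq_api.py for the
# real body). Port 5556 and the PUB role come from config_ZMQ.json.
import zmq

_ctx = None
_sock = None

def init_zmq():
    global _ctx, _sock
    if _sock is not None:
        return _sock                     # already bound: safe to call twice
    _ctx = zmq.Context.instance()
    _sock = _ctx.socket(zmq.PUB)
    _sock.bind("tcp://127.0.0.1:5556")   # Holosoma connects as SUB
    return _sock
```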

---

### Voice/

Mic, TTS and wake-word pipeline. All three files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable).

#### `builtin_mic.py` (~180 lines, new 2026-04-21)
Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM in a thread-safe ring buffer.

**Exports:**
- `BuiltinMic(group, port, buf_max, read_timeout)` — init
- `start()` / `stop()` — socket lifecycle (`start()` is idempotent)
- `read_chunk(n)` — pull exactly `n` bytes (blocks up to `read_timeout`, pads silence otherwise)
- `read_seconds(s)` — convenience for "record `s` seconds"
- `flush()` — drop buffered audio (called while TTS plays, to avoid echo)

#### `builtin_tts.py` (~70 lines, new 2026-04-21)
Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input (the G1 silently maps Arabic to Chinese, which confuses everyone).

**Exports:**
- `BuiltinTTS(audio_client, default_speaker_id=0)` — init
- `speak(text, speaker_id=None, block=True)` — synth + play on G1 body speaker

#### `marcus_voice.py` (~340 lines, rewired 2026-04-21)
Always-listening English voice loop with a four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` listens for the wake word "Sanad" on 2-second chunks; Whisper `small` transcribes the full command. Mic input comes from `BuiltinMic`; responses go through `audio_api.speak()` → `BuiltinTTS`. A minimal wake-check sketch follows the export list.

**Exports:**
- `VoiceModule(audio_api, on_command=cb)` — init
- `start()` — spawn background thread
- `stop()` — graceful teardown
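```python
# Sketch of the IDLE-state wake check, assuming the openai-whisper package
# and numpy. The 2 s chunk size and wake-word list mirror
# config_Voice.json::stt; the real loop in marcus_voice.py adds state
# transitions and mic muting.
import numpy as np
import whisper

wake_model = whisper.load_model("tiny")
WAKE_WORDS = ("sanad", "sannad", "sanat", "sunnat")

def heard_wake_word(mic) -> bool:
    pcm = mic.read_seconds(2.0)  # int16 mono 16 kHz from BuiltinMic
    audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
    text = wake_model.transcribe(audio, language="en", fp16=False)["text"]
    return any(w in text.lower() for w in WAKE_WORDS)
```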

---

### Server/

#### `marcus_server.py` (224 lines)

@ -1,15 +1,16 @@
# Marcus — Control & Startup Guide

**Updated**: 2026-04-06
**Robot persona:** Sanad (wake word + self-intro; project code lives under `Marcus/`)
**Updated**: 2026-04-21

---

## Quick Start

### Prerequisites (Jetson Orin NX)
### Prerequisites (Jetson Orin NX, JetPack 5.1.1)

```bash
# Terminal 1 — Start Holosoma (locomotion policy)
# Terminal 1 — Start Holosoma (locomotion policy, in hsinference env)
source ~/.holosoma_deps/miniconda3/bin/activate hsinference
cd ~/holosoma
~/.holosoma_deps/miniconda3/envs/hsinference/bin/python3 \
@ -19,28 +20,46 @@ cd ~/holosoma
  --task.velocity-input zmq \
  --task.state-input zmq \
  --task.interface eth0

# Terminal 2 — Ollama server (leave running)
ollama serve > /tmp/ollama.log 2>&1 &
sleep 3
ollama list   # confirm qwen2.5vl:3b present
```

### Option A — Terminal Mode (on Jetson)

```bash
# Terminal 2 — Start Marcus Brain
conda activate Marcus
ollama serve & sleep 3
# Terminal 3 — Start Marcus Brain
conda activate marcus
cd ~/Marcus
python3 run_marcus.py
```

Direct keyboard control. All commands typed locally.
Direct keyboard control + voice input (say **"Sanad"** to wake). Expected banner on boot:

```
================================================
  SANAD AI BRAIN — READY
================================================
  model    : qwen2.5vl:3b
  yolo     : True
  odometry : True
  memory   : True
  lidar    : True
  voice    : True
  camera   : 424x240@15
```

### Option B — Server + Client (remote)

```bash
# Terminal 2 (Jetson) — Start Server
# Terminal 3 (Jetson) — Start Server
conda activate marcus
cd ~/Marcus
python3 -m Server.marcus_server

# Terminal 3 (Workstation) — Connect Client
# Terminal 4 (Workstation) — Connect Client
cd ~/Robotics_workspace/yslootahtech/Project/Marcus
python3 -m Client.marcus_cli
```
@ -58,6 +77,20 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`

---

## Voice

- **Wake word:** "Sanad" (variants "sannad", "sanat", "sunnat" — see `config_Voice.json::stt.wake_words_en`)
- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed.
- **STT:** Whisper `tiny` (wake detection) + Whisper `small` (command transcription) — both run locally.
- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only.
- **Barge-in:** say something while Marcus is speaking and the mic buffer flushes on the next command.

Interaction flow: say "Sanad" → hear *"Listening"* → speak your command → see the transcript on the console → Marcus answers through the speaker.

To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only, ~2 s faster.

---

## Command Reference

### Movement
@ -75,17 +108,17 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
### Vision
| Command | Action |
|---------|--------|
| `what do you see` | LLaVA describes camera view |
| `describe the room` | LLaVA scene description |
| `is anyone here` | LLaVA person check |
| `what do you see` | Qwen2.5-VL describes camera view |
| `describe the room` | Qwen2.5-VL scene description |
| `is anyone here` | Qwen2.5-VL person check |
| `yolo` | Show YOLO detection status |

### Goal Navigation
| Command | Action |
|---------|--------|
| `goal/ stop when you see a person` | YOLO fast search + stop |
| `goal/ find a laptop` | YOLO + LLaVA search |
| `goal/ stop when you see a guy holding a phone` | YOLO + LLaVA compound verification |
| `goal/ find a laptop` | YOLO + Qwen-VL search |
| `goal/ stop when you see a guy holding a phone` | YOLO + Qwen-VL compound verification |
| `find a person` | Auto-detected as goal (no prefix needed) |
| `look for a bottle` | Auto-detected as goal |

@ -106,7 +139,7 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
| `patrol` | Autonomous patrol (prompts for duration) |
| `patrol: door → desk → exit` | Named waypoint patrol |

### Image Search
### Image Search (requires `subsystems.imgsearch: true`)
| Command | Action |
|---------|--------|
| `search/ /path/to/photo.jpg` | Find target from reference image |
@ -122,11 +155,20 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
| `last session` | Previous session summary |
| `session summary` | Current session stats |

### Autonomous Mode
| Command | Action |
|---------|--------|
| `auto on` | Start autonomous exploration |
| `auto off` | Stop |
| `auto status` | Current step / observations |
| `auto save` | Snapshot observations to disk |

### System
| Command | Action |
|---------|--------|
| `help` | Command reference |
| `example` | Usage examples |
| `lidar` / `lidar status` | SLAM engine pose + health |
| `q` / `quit` | Shutdown |

### Client-Only Commands (CLI)

@ -139,35 +181,43 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`

---

## Subsystem flags (`Config/config_Brain.json`)

Control what initializes at boot. Defaults:

```jsonc
"subsystems": {
  "lidar": true,
  "voice": true,
  "imgsearch": false,
  "autonomous": true
}
```

Set any to `false` to skip that subsystem's init (a sketch of the gating follows this list). Boot time drops roughly:
- `voice: false` → ~2 s faster (no Whisper model load)
- `lidar: false` → ~1 s faster (no SLAM subprocess spawn)
- `imgsearch: false` → already the default; re-enable only when you need `search/ …`
- `autonomous: false` → minor, but removes the AutonomousMode init
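```python
# Sketch of how init_brain() consumes the flags (illustrative; the real
# gating lives in Brain/marcus_brain.py and calls the init_* helpers
# from API/).
subs = load_config("Brain").get("subsystems", {})
if subs.get("lidar", True):
    init_lidar()
if subs.get("voice", True):
    _init_voice()
if subs.get("imgsearch", False):
    init_imgsearch()
if subs.get("autonomous", True):
    autonomous = AutonomousMode()
```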

---

## Network Configuration

| Interface | IP | Use |
|-----------|-----|------|
| `eth0` | 192.168.123.164 | Robot internal network (Jetson - G1 - LiDAR) |
| `wlan0` | 10.255.254.86 | Office WiFi (Jetson - Workstation) |
| `eth0` | 192.168.123.164 | Robot internal network (Jetson ↔ G1 ↔ LiDAR) |
| `wlan0` | 10.255.254.86 | Office WiFi (Jetson ↔ Workstation) |

| Service | Port | Protocol |
|---------|------|----------|
| Marcus WebSocket | 8765 | ws:// |
| ZMQ Velocity | 5556 | tcp:// (PUB/SUB) |
| Ollama API | 11434 | HTTP |
| LiDAR | 192.168.123.120 | Livox Mid360 |
| ZMQ velocity (→ Holosoma) | 5556 | tcp:// (PUB/SUB) |
| Ollama API | 11434 | HTTP (localhost only) |
| G1 audio multicast (mic) | 5555 | UDP multicast 239.168.123.161 |
| Livox Mid-360 (LiDAR) | 192.168.123.120 | UDP (Livox SDK) |

All configurable in `Config/config_Network.json`.

---

## Subsystem Status

On startup, the server/brain shows:
```
YOLO     : active (19 tracked classes, CPU, yolov8m.pt)
Odometry : active (dead reckoning, +/-10cm)
Memory   : active (session_016_2026-04-06)
Camera   : 424x240@15 (RealSense D435I)
LiDAR    : ALIVE (Livox Mid360 at 192.168.123.120)
Arms     : pending (GR00T N1.5 not yet integrated)
```
Most values configurable in `Config/config_Network.json` and `config_Voice.json::mic_udp`.

---

@ -175,13 +225,15 @@ Arms : pending (GR00T N1.5 not yet integrated)

| Issue | Cause | Fix |
|-------|-------|-----|
| `ModuleNotFoundError: No module named 'Server'` | Wrong directory | `cd ~/Marcus` then run |
| Robot doesn't move | Holosoma not running | Start Holosoma first (Terminal 1) |
| Robot doesn't move | ZMQ port conflict | Only run one of Server or Brain, not both |
| `Camera: {e} reconnecting` | USB bandwidth | Reduce to `low` profile |
| LLaVA slow (>10 s) | GPU VRAM full | Kill other GPU processes, or use `qwen2.5vl:3b` |
| `YOLO not available` | ultralytics not installed | `pip install ultralytics` |
| Client can't connect | Wrong IP or server not running | Check `status` command, verify IP |
| Banner shows `SANAD AI BRAIN — READY` but nothing moves | Holosoma not running | Start Holosoma (Terminal 1) first |
| `RuntimeError: CUDA not available` on boot | Wrong torch build on Jetson | See `Doc/environment.md` section 9.2 — reinstall the NVIDIA Jetson torch wheel |
| `llama runner process has terminated: %!w(<nil>)` | Ollama compute-graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` |
| Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only |
| `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10–15 s on first Qwen load; subsequent commands are fast |
| Wake word never fires | Whisper hearing something else | Check `logs/voice.log` — if it transcribes as "sunnat"/"sannat", add your variant to `config_Voice.json::stt.wake_words_en` |
| Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" |
| `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If it persists, `ping 192.168.123.120` |
| Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up |

---

@ -191,6 +243,7 @@ Arms : pending (GR00T N1.5 not yet integrated)

|------|------|
| Brain code | `~/Marcus/Brain/` |
| Server | `~/Marcus/Server/marcus_server.py` |
| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,marcus_voice}.py` |
| Config | `~/Marcus/Config/` |
| Prompts | `~/Marcus/Config/marcus_prompts.yaml` |
| YOLO model | `~/Marcus/Models/yolov8m.pt` |
@ -199,3 +252,5 @@ Arms : pending (GR00T N1.5 not yet integrated)
| Logs | `~/Marcus/logs/` |

See `Doc/architecture.md` for full project structure and file-by-file documentation.
See `Doc/environment.md` for the verified Jetson software stack.
See `Doc/pipeline.md` for the end-to-end data flow.

@ -1,10 +1,11 @@
# Marcus — Environment & Version Reference

**Project**: Marcus | YS Lootah Technology
**Robot persona**: Sanad (wake word + self-intro; codebase stays under `Marcus/`)
**Hardware**: Unitree G1 EDU Humanoid (29 DOF) + Jetson Orin NX 16 GB
**Deployment host**: `unitree@192.168.123.164` (hostname `ubuntu`)
**Conda env**: `marcus`
**Captured**: 2026-04-12
**Captured**: 2026-04-12 (updated 2026-04-21)

This document is the canonical record of the verified GPU-accelerated software stack running on the Jetson Orin NX. It covers system software, Python environment, Marcus runtime dependencies, installation recipe, verification commands, and known quirks. Pair it with `architecture.md` (what the code does) and `controlling.md` (how to drive it).

@ -136,29 +137,23 @@ Captured from `importlib` on 2026-04-12, `marcus` env on the Jetson.

## 8. Marcus project modules — import status

All 16 project modules import cleanly from the `marcus` env at `/home/unitree/Marcus`:
All 25 project modules import cleanly from the `marcus` env at `/home/unitree/Marcus`:

```
OK Core.config_loader
OK Core.env_loader
OK Vision.marcus_yolo
OK Vision.marcus_imgsearch
OK API.llava_api
OK API.yolo_api
OK API.camera_api
OK API.zmq_api
OK API.imgsearch_api
OK API.odometry_api
OK API.memory_api
OK API.arm_api
OK Navigation.goal_nav
OK Navigation.patrol
OK Navigation.marcus_odometry
OK Brain.marcus_brain
OK Brain.marcus_memory
OK Core.config_loader Core.env_loader
OK Core.log_backend Core.logger
OK Voice.builtin_mic Voice.builtin_tts Voice.marcus_voice
OK Vision.marcus_yolo Vision.marcus_imgsearch
OK API.llava_api API.yolo_api API.camera_api
OK API.zmq_api API.imgsearch_api API.odometry_api
OK API.memory_api API.arm_api API.audio_api
OK Navigation.goal_nav Navigation.patrol Navigation.marcus_odometry
OK Brain.marcus_brain Brain.marcus_memory Brain.command_parser
OK Autonomous.marcus_autonomous
```
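A quick way to reproduce the table above (a hypothetical one-liner, not a repo script; run from `~/Marcus` inside the `marcus` env):

```python
# Smoke-test imports for a representative subset of the 25 modules.
import importlib

MODULES = ["Core.config_loader", "Voice.builtin_mic", "Voice.builtin_tts",
           "API.audio_api", "Brain.marcus_brain"]
for m in MODULES:
    importlib.import_module(m)
    print("OK", m)
```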

Notable removals: `Voice/marcus_gemini_voice.py` deleted on 2026-04-21. `Core/Logger.py` renamed to `Core/log_backend.py`.

---

## 9. Installation recipe (reproducing this environment)
@ -378,3 +373,7 @@ Config file (`Config/config_Vision.json`):
| 2026-04-12 | Initial environment.md — full stack captured, GPU bring-up verified end to end. Steady-state YOLOv8m FPS on Orin NX measured at 21.9. Ollama Qwen2.5-VL verified at 100% GPU. |
| 2026-04-12 | `Vision/marcus_yolo.py` rewired to load `config_Vision.json`, added `_resolve_device()` with hard-fail on missing CUDA (GPU-only policy). `API/yolo_api.py` updated to propagate `RuntimeError`. `Config/config_Vision.json` set `yolo_device=cuda`, `yolo_half=true`. |
| 2026-04-12 | Installed NVIDIA Jetson torch `2.1.0a0+41361538.nv23.06` (replacing CPU-only PyPI `2.4.1`) + built torchvision `0.16.1` from source against it. Verified `nms device = cuda:0`. |
| 2026-04-12 | Fixed llama.cpp compute-graph OOM on Jetson: added `num_batch=128` + `num_ctx=2048` caps in `Config/config_Brain.json`, propagated through `API/llava_api.py` and `Vision/marcus_imgsearch.py`. Qwen2.5-VL compute graph drops from ~7.5 GiB to ~1.8 GiB. |
| 2026-04-21 | **Restructure**: moved ZMQ bind out of `API/zmq_api.py` import time into `init_zmq()`; fixes LiDAR SLAM worker spawn crash. Added loud GPU-requirement banner in `API/yolo_api.py`. Dropped `num_predict_main` 200→120. Made inner-loop sleeps in goal_nav/autonomous/imgsearch conditional. Renamed `Core/Logger.py` → `Core/log_backend.py` (case-collision fix). Updated `Doc/MARCUS_API.md` to current state. |
| 2026-04-21 | **Voice restructure**: added `Voice/builtin_mic.py` (G1 array mic via UDP multicast `239.168.123.161:5555`) and `Voice/builtin_tts.py` (thin `AudioClient.TtsMaker` wrapper). Rewired `Voice/marcus_voice.py` to use BuiltinMic. Refactored `API/audio_api.py::speak()` to use BuiltinTTS — removed ~110 lines of edge-tts + pydub + Piper plumbing. Deleted `Voice/marcus_gemini_voice.py`. Added the `subsystems.{lidar,voice,imgsearch,autonomous}` gate in `config_Brain.json`, consumed by `init_brain()`. |
| 2026-04-21 | **Persona swap**: robot identifies as Sanad. Wake words `["sanad","sannad","sanat","sunnat"]`, `speaker.app_name="sanad"`, all Qwen prompts say "You are Sanad", banner reads `SANAD AI BRAIN — READY`, hardcoded self-intro says "I am Sanad". Project directory, class names, filenames, and `PROJECT_NAME=Marcus` env var unchanged. |

15
Doc/note.txt
15
Doc/note.txt
@ -38,15 +38,12 @@ rm ~/Robotics_workspace/yslootahtech/Project/Marcus_fine_tune/marcus-gguf/marcus

https://ingrid789.github.io/SkillMimic/
https://github.com/wyhuai/SkillMimic

https://vla-survey.github.io/

https://github.com/AnjieCheng/NaVILA
https://rchalyang.github.io/EgoVLA/
https://github.com/RchalYang/EgoVLA_Release
https://github.com/openvla/openvla
https://github.com/unitreerobotics/unifolm-vla
https://github.com/OpenDriveLab/WholebodyVLA

187
Doc/pipeline.md
Normal file
187
Doc/pipeline.md
Normal file
@ -0,0 +1,187 @@
# Marcus — End-to-End Pipeline

**Robot persona:** Sanad (wake word + self-intro)
**Updated:** 2026-04-21

One map of every data path from sensor to motor, voice to speech. Cross-reference with `architecture.md` (what each file is) and `MARCUS_API.md` (function signatures).

---

## Boot sequence

`Brain/marcus_brain.py::init_brain()` — called once from `run_marcus.py` or `marcus_server.py`.

```
run_marcus.py
  │
  ▼
init_brain()
  │
  ├─ init_zmq()                      PUB bind tcp://127.0.0.1:5556 → Holosoma
  ├─ start_camera()                  RealSense 424×240@15fps → shared _raw_frame
  ├─ init_yolo(raw_frame, raw_lock)  YOLOv8m CUDA FP16, 19 classes — background thread
  ├─ init_odometry()                 ROS2 /dog_odom → dead reckoning fallback
  ├─ init_memory()                   loads Data/Brain/Sessions/session_NNN/
  │
  ├─ if subsystems.lidar:      init_lidar()      multiprocessing spawn SLAM_worker
  ├─ if subsystems.imgsearch:  init_imgsearch()  (off by default)
  ├─ if subsystems.autonomous: AutonomousMode()  patrol state machine
  │
  ├─ send_cmd("start") + 0.5s + send_cmd("walk") + 0.5s   Holosoma handshake
  │
  ├─ if subsystems.voice: _init_voice()   ▼ voice pipeline below
  └─ _warmup_llava()                      first Qwen2.5-VL inference
       "SANAD AI BRAIN — READY"
```

Subsystem flags live in `config_Brain.json::subsystems`. Current defaults:

```json
"subsystems": { "lidar": true, "voice": true, "imgsearch": false, "autonomous": true }
```

---

## Voice pipeline (when `subsystems.voice = true`)

```
G1 body mic (array)
  └─ UDP multicast 239.168.123.161:5555 ── int16 mono 16 kHz PCM
      ▼
Voice/builtin_mic.py::BuiltinMic
  ring buffer (64 KB) + read_chunk(n)
      ▼
Voice/marcus_voice.py::VoiceModule (IDLE → WAKE_HEARD → PROCESSING → SPEAKING)
  ├─ IDLE       : 2-s chunks → Whisper tiny → wake-word match ("sanad"/"sannad"/…)
  ├─ WAKE_HEARD : audio_api.speak("Listening") → G1 body speaker
  ├─ PROCESSING : record-until-silence → Whisper small → transcribed text
  └─ on_command(text, "en")
      ▼
Brain/marcus_brain.py::process_command(text)
  ├─ regex fast-path → Brain/command_parser.py::try_local_command()
  │    places · odometry walk/turn · patrol · session recall · goal_nav · auto on/off
  └─ else → _handle_llava(text)
       ├─ get_frame()   (10×50 ms poll, no 1 s stall)
       ├─ API/llava_api.py::ask(text, img)
       │    ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120)
       │    → parse_json() → {actions, arm, speak, abort}
       └─ Brain/executor.py::execute(d)
            ├─ actions → API/zmq_api.py::send_vel(vx, vy, vyaw) → Holosoma
            ├─ arm → API/arm_api.py (stub for now)
            └─ abort → gradual_stop()
      ▼
result["speak"] → audio_api.speak(reply)
      ▼
API/audio_api.py::speak(text, lang="en")
  ├─ mute mic (flush BuiltinMic buffer)
  ├─ Voice/builtin_tts.py::BuiltinTTS.speak(text)
  │    client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only
  │    time.sleep(len(text) * 0.08)
  └─ unmute mic → back to IDLE
```

---

## Terminal / WebSocket command pipeline (same brain, skips voice)

```
run_marcus.py stdin   OR   Server/marcus_server.py WebSocket
      ▼
Brain/marcus_brain.py::process_command(text)
      ▼ (same parser → LLaVA → executor → ZMQ as above)
      ▼
result dict → stdout OR WebSocket reply frame
```

---

## Vision pipeline (continuous, consumed by brain on demand)

```
RealSense D435 (USB)
  └─ 424×240 BGR 15 fps
       → API/camera_api.py — shared _raw_frame (thread-safe)
            │
            └─ get_frame() → JPEG base64 on demand
      ▼
Vision/marcus_yolo.py (daemon thread)
  YOLOv8m @ cuda:0 FP16 imgsz=320
  → _latest_detections (thread-safe list)
  yolo_sees / yolo_closest / yolo_summary / yolo_fps
      ▼
Navigation/goal_nav.py (fast YOLO check → Qwen-VL fallback)
Autonomous/marcus_autonomous.py (patrol scan every N steps)
Brain/marcus_brain.py (status / alerts)
```

---

## Movement pipeline

```
Brain/executor.py OR Brain/command_parser.py OR Navigation/*
  │ uses MOVE_MAP from config_Navigation.json
  ▼
API/zmq_api.py::send_vel(vx, vy, vyaw)   JSON over ZMQ PUB (port 5556)
  ▼
Holosoma RL policy (separate process, hsinference env)
  ▼
G1 low-level joint commands over DDS/eth0
  ▼
29-DOF body motion
```

---

## LiDAR pipeline (when `subsystems.lidar = true`)

```
Livox Mid-360 (192.168.123.120, UDP)
  ▼
Lidar/SLAM_worker.py (multiprocessing.spawn subprocess — CUDA-safe spawn)
  ├─ SLAM_engine, SLAM_Filter, SLAM_LoopClosure, SLAM_Submap, SLAM_NavRuntime
  ├─ publishes pose + obstacle flags back to parent via Queue
  └─ writes occupancy grids to Data/Navigation/Maps/
  ▼
API/lidar_api.py (reads the queues, exposes:)
  ├─ obstacle_ahead() → bool
  ├─ get_lidar_status() → dict (pose, loc_state, frame age, FPS, ICP ms)
  └─ LIDAR_AVAILABLE
  ▼
Navigation/goal_nav.py rotation thread — pauses motion on obstacle_ahead()
Brain/command_parser.py — responds to "lidar status" queries
```

---

## Knobs that control each stage

| Knob | Location | Effect |
|---|---|---|
| `subsystems.lidar` | config_Brain.json | SLAM subprocess on/off |
| `subsystems.voice` | config_Brain.json | BuiltinMic + Whisper + TtsMaker loop on/off |
| `subsystems.imgsearch` | config_Brain.json | image-guided search init on/off |
| `subsystems.autonomous` | config_Brain.json | auto-patrol state machine init on/off |
| `num_batch`, `num_ctx` | config_Brain.json | llama.cpp compute-graph size (128 / 2048 ≈ 1.8 GiB graph — **do not raise** on 16 GB Jetson) |
| `num_predict_main` | config_Brain.json | 120 tokens max for the main JSON reply |
| `yolo_device`, `yolo_half` | config_Vision.json | `cuda` / FP16 (hard-required; CPU not allowed) |
| `mic.backend` | config_Voice.json | `builtin_udp` (G1 array) or `pactl_parec` (Hollyland fallback) |
| `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast |
| `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) |
| `stt.wake_words_en` | config_Voice.json | Whisper matcher (`sanad` + variants) |

---

## Per-command latency (estimated, post-fixes)

| Step | Typical | Notes |
|---|---|---|
| Wake-word detect | 200–500 ms | Whisper tiny on 2 s chunk |
| Record until silence | 1–8 s | depends on user speech |
| Whisper small STT | 500–1500 ms | once per command |
| Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall |
| Ollama Qwen2.5-VL | 800–1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` |
| Executor + ZMQ send | <10 ms | fire-and-forget PUB |
| TtsMaker playback | ~len(text) × 80 ms | synthesizes + plays on robot |

**Total wake → answer-playback:** ~**2.5–4 s** for a short vision question like "what do you see" (vs. 5–8 s with the pre-restructure edge-tts/Gemini overhead).
@ -123,26 +123,36 @@ def navigate_to_goal(goal: str, max_steps: int = 0):
    reached = False
    try:
        for step in range(1, max_steps + 1):
            time.sleep(SCAN_INTERVAL)
            # Track whether real work happened this iteration. If it did,
            # the work itself already ate wall time — don't pay an extra
            # SCAN_INTERVAL nap on top.
            did_work = False

            # --- YOLO fast check ---
            if yolo_target and yolo_sees(yolo_target):
                img_b64 = get_frame()
                did_work = True
                if condition:
                    if not _verify_condition(yolo_target, condition, img_b64):
                        print(f" [GoalNav] YOLO sees {yolo_target} but condition "
                              f"'{condition}' not met — continuing")
                        continue

                    print(f" [GoalNav] YOLO confirmed '{yolo_target}' at step {step}")
                    log_detection(yolo_target, position="goal", distance="close")
                    reached = True
                    break
                    # fall through to the sleep-skip path
                else:
                    print(f" [GoalNav] YOLO confirmed '{yolo_target}' at step {step}")
                    log_detection(yolo_target, position="goal", distance="close")
                    reached = True
                    break

            # --- LLaVA fallback (less frequent — every few steps) ---
            if step >= MIN_STEPS and step % MIN_STEPS == 0:
                img_b64 = get_frame()
                if img_b64:
                    did_work = True
                    d = ask_goal(goal, img_b64)
                    if d.get("reached"):
                        print(f" [GoalNav] LLaVA says goal reached at step {step}")
@ -152,6 +162,11 @@ def navigate_to_goal(goal: str, max_steps: int = 0):
                    if speak:
                        print(f" [GoalNav] LLaVA: {speak}")

            # Only pay the scan interval when nothing happened this step.
            # If YOLO hit or LLaVA fired, they already took 50–1000 ms.
            if not did_work:
                time.sleep(SCAN_INTERVAL)

    finally:
        rotating[0] = False
        rot_thread.join(timeout=1.0)
|
||||
# ══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
DEFAULT_MAX_STEPS = 60 # max rotation steps before giving up
|
||||
STEP_DELAY = 0.4 # seconds between YOLO checks
|
||||
STEP_DELAY = 0.15 # min gap between YOLO checks (was 0.4 — reduced
|
||||
# because the rotation thread paces motion already
|
||||
# and each LLaVA call is 600-1500 ms of real work)
|
||||
ROTATE_SPEED = 0.25 # rad/s rotation speed during search
|
||||
MIN_STEPS_WARMUP = 3 # skip first N steps (stale frame)
|
||||
MATCH_CONFIDENCE_THR = 0.6 # LLaVA confidence threshold (not used directly,
|
||||
|
||||

202
Voice/builtin_mic.py
Normal file
202
Voice/builtin_mic.py
Normal file
@ -0,0 +1,202 @@
"""
builtin_mic.py — G1 built-in microphone (UDP multicast capture)
================================================================
The G1 humanoid's on-board microphone is published by the Unitree firmware
as an RTP-like UDP multicast stream on 239.168.123.161:5555, carrying
16 kHz mono int16 PCM. Any host on the robot's 192.168.123.0/24 network
can join the group and read the audio — no extra SDK call required.

This module intentionally has no dependency on pyaudio, pulseaudio, or the
unitree_sdk2py package. Joining the multicast group is all that's needed.

Usage:
    from Voice.builtin_mic import BuiltinMic
    mic = BuiltinMic()
    mic.start()
    try:
        chunk = mic.read_chunk(1024)   # 512 samples, 32 ms at 16 kHz
        ...
    finally:
        mic.stop()

Ported from Project/Sanad/voice/audio_io.py (Sanad's production implementation).
"""

from __future__ import annotations

import socket
import struct
import subprocess
import threading
import time
from typing import Optional


DEFAULT_GROUP = "239.168.123.161"
DEFAULT_PORT = 5555
DEFAULT_BUF_MAX = 64_000        # ~2 s of 16 kHz mono int16
DEFAULT_READ_TIMEOUT = 0.04     # 40 ms budget per read_chunk call
SAMPLE_RATE = 16_000            # hardware rate — do not change


def _find_g1_local_ip() -> str:
    """
    Return the host IPv4 on the G1's internal 192.168.123.0/24 network.
    Required by IP_ADD_MEMBERSHIP so the kernel knows which NIC to join on.
    """
    out = subprocess.run(
        ["ip", "-4", "-o", "addr"], capture_output=True, text=True,
    ).stdout
    for line in out.splitlines():
        for tok in line.split():
            if tok.startswith("192.168.123."):
                return tok.split("/")[0]
    raise RuntimeError(
        "BuiltinMic: no interface on 192.168.123.0/24 — "
        "host is not on the G1's internal network"
    )


class BuiltinMic:
    """
    G1 on-board microphone over UDP multicast.

    Thread-safe: a background daemon thread receives datagrams into an
    internal ring buffer; `read_chunk(n)` pulls the next `n` bytes or
    blocks up to `read_timeout` before returning zeros.
    """

    sample_rate = SAMPLE_RATE

    def __init__(
        self,
        group: str = DEFAULT_GROUP,
        port: int = DEFAULT_PORT,
        buf_max: int = DEFAULT_BUF_MAX,
        read_timeout: float = DEFAULT_READ_TIMEOUT,
    ):
        self._group = group
        self._port = port
        self._buf_max = buf_max
        self._read_timeout = read_timeout
        self._sock: Optional[socket.socket] = None
        self._buf = bytearray()
        self._lock = threading.Lock()
        self._running = False
        self._thread: Optional[threading.Thread] = None

    def start(self) -> None:
        if self._running:
            return
        local_ip = _find_g1_local_ip()
        self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self._sock.bind(("", self._port))
        mreq = struct.pack(
            "4s4s",
            socket.inet_aton(self._group),
            socket.inet_aton(local_ip),
        )
        self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
        self._sock.settimeout(1.0)
        self._running = True
        self._thread = threading.Thread(
            target=self._recv_loop, daemon=True, name="builtin_mic_rx",
        )
        self._thread.start()
        print(f" [BuiltinMic] joined {self._group}:{self._port} on {local_ip}")

    def _recv_loop(self) -> None:
        while self._running:
            try:
                data, _ = self._sock.recvfrom(4096)
                with self._lock:
                    self._buf.extend(data)
                    # ring-buffer: drop oldest when we'd exceed buf_max
                    if len(self._buf) > self._buf_max:
                        del self._buf[: len(self._buf) - self._buf_max]
            except socket.timeout:
                continue
            except Exception:
                if self._running:
                    time.sleep(0.01)

    def read_chunk(self, num_bytes: int) -> bytes:
        """
        Return exactly `num_bytes` of 16 kHz mono int16 PCM.

        Waits up to `read_timeout` for that many bytes to be available.
        If the buffer is still short after the timeout, returns whatever
        is available padded with silence. Never blocks forever.
        """
        deadline = time.time() + self._read_timeout
        while time.time() < deadline:
            with self._lock:
                if len(self._buf) >= num_bytes:
                    chunk = bytes(self._buf[:num_bytes])
                    del self._buf[:num_bytes]
                    return chunk
            time.sleep(0.003)
        with self._lock:
            avail = len(self._buf)
            if avail > 0:
                chunk = bytes(self._buf[:avail])
                del self._buf[:avail]
                return chunk + b"\x00" * (num_bytes - avail)
        return b"\x00" * num_bytes

    def read_seconds(self, seconds: float) -> bytes:
        """
        Convenience: capture `seconds` of audio and return as bytes.
        Blocks for the full duration (not a real-time producer).
        """
        num_bytes = int(seconds * self.sample_rate * 2)  # 2 bytes/sample (int16)
        out = bytearray()
        chunk_bytes = 1024
        while len(out) < num_bytes:
            out.extend(self.read_chunk(min(chunk_bytes, num_bytes - len(out))))
        return bytes(out)

    def flush(self) -> None:
        """Drop all buffered audio (e.g. after the robot spoke)."""
        with self._lock:
            self._buf.clear()

    def stop(self) -> None:
        self._running = False
        if self._sock is not None:
            try:
                self._sock.close()
            except Exception:
                pass
            self._sock = None
        if self._thread is not None:
            self._thread.join(timeout=1.5)
            self._thread = None


# ────────────────────────────────────────────────────────────────
# Standalone test — capture 3 s and print energy stats
# ────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    import array

    print("BuiltinMic standalone test — capturing 3 s from G1...")
    mic = BuiltinMic()
    mic.start()
    time.sleep(0.3)  # let the receiver thread warm up
    raw = mic.read_seconds(3.0)
    mic.stop()

    samples = array.array("h", raw)
    if not samples:
        print(" FAIL — got zero samples")
    else:
        mn = min(samples); mx = max(samples)
        mean_abs = sum(abs(s) for s in samples) / len(samples)
        print(f" samples={len(samples)} min={mn} max={mx} mean|s|={mean_abs:.0f}")
        if mean_abs > 30:
            print(" OK — mic is capturing audio")
        else:
            print(" WARN — signal very low, check G1 audio service is running")

88
Voice/builtin_tts.py
Normal file
88
Voice/builtin_tts.py
Normal file
@ -0,0 +1,88 @@
"""
builtin_tts.py — Unitree G1 built-in TTS (English only)
========================================================
Thin wrapper around AudioClient.TtsMaker(text, speaker_id). The G1's on-board
TTS engine synthesizes and plays directly through the body speaker — no
internet, no MP3/WAV roundtrip, no audio SDK plumbing on our side.

Supported languages (firmware-side):
    English — works (Marcus uses this)
    Chinese — works (unused)
    Arabic  — silently falls back to Chinese (unusable — we refuse these)

Signature:
    client.TtsMaker(text: str, speaker_id: int) -> int   # 0 = success
    speaker_id ∈ {0, 1, 2} — different voice timbres

Usage:
    from Voice.builtin_tts import BuiltinTTS
    tts = BuiltinTTS(audio_client)
    tts.speak("Hello, I am Sanad", speaker_id=0)
"""

from __future__ import annotations

import logging
import time
from typing import Optional

log = logging.getLogger("builtin_tts")


class BuiltinTTS:
    """Synchronous English-only TTS via the G1's on-board engine."""

    # Rough playback duration per character — enough margin that `speak()`
    # returns after audio has actually finished on the robot.
    SECONDS_PER_CHAR = 0.08
    MIN_SECONDS = 1.5

    def __init__(self, audio_client, default_speaker_id: int = 0):
        """
        Args:
            audio_client       : initialized unitree_sdk2py AudioClient
            default_speaker_id : 0, 1, or 2 (default voice timbre)
        """
        self._client = audio_client
        self._default_speaker = default_speaker_id

    def speak(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        block: bool = True,
    ) -> int:
        """
        Play `text` on the G1 speaker via TtsMaker.

        English-only by policy. Non-ASCII (Arabic) input is rejected rather
        than silently played back as Chinese. Returns the TtsMaker status
        code (0 = success) or -1 if input was rejected.
        """
        if not text or not text.strip():
            return -1

        # Reject non-English. TtsMaker "falls back" by playing Arabic text
        # as Chinese phonemes — intelligible to nobody — so we refuse it
        # rather than surprise the operator.
        if any(ord(c) > 127 for c in text):
            log.warning("builtin_tts refusing non-ASCII text: %r", text[:60])
            return -1

        sid = self._default_speaker if speaker_id is None else speaker_id
        log.info("[TtsMaker sid=%d] %s", sid, text[:80])

        try:
            code = self._client.TtsMaker(text, sid)
        except Exception as e:
            log.error("TtsMaker call failed: %s", e)
            return -1

        if block:
            # Estimate how long the G1 is going to take to finish speaking.
            # TtsMaker is fire-and-forget — we need to wait so the mic loop
            # knows when to unmute.
            duration = max(self.MIN_SECONDS, len(text) * self.SECONDS_PER_CHAR)
            time.sleep(duration)

        return code
@ -1,608 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Voice/marcus_gemini_voice.py — Marcus Gemini Live Voice Module v2
|
||||
==================================================================
|
||||
Real-time bidirectional voice conversation using Gemini 2.5 Flash native audio.
|
||||
Uses G1 built-in speaker + Hollyland wireless mic.
|
||||
|
||||
Based on SanadVoice/gemini_interact architecture:
|
||||
- PyAudio for mic (not parec)
|
||||
- Echo suppression (silence when speaking)
|
||||
- Gemini VAD (automatic activity detection)
|
||||
- thinkingBudget=0 (no thinking text)
|
||||
- ASR buffering for full sentences
|
||||
- Vision routed to brain's Qwen camera
|
||||
|
||||
Usage:
|
||||
from Voice.marcus_gemini_voice import GeminiVoiceModule
|
||||
voice = GeminiVoiceModule(audio_api, on_transcript=callback)
|
||||
voice.start()
|
||||
"""
|
||||
|
||||
import array
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
import numpy as np
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
|
||||
PROJECT_NAME = "Marcus"
|
||||
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)
|
||||
|
||||
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
|
||||
os.makedirs(LOG_DIR, exist_ok=True)
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
log = logging.getLogger("gemini_voice")
|
||||
|
||||
|
||||
def load_config(name: str) -> dict:
|
||||
path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
|
||||
with open(path, "r") as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
# ─── CONFIGURATION ────────────────────────────────────────
|
||||
|
||||
API_KEY = "AIzaSyDt9Xi83MDZuuPpfwfHyMD92X7ZKdGkqf8"
|
||||
MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
|
||||
URI = (
|
||||
"wss://generativelanguage.googleapis.com/ws/"
|
||||
"google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent"
|
||||
f"?key={API_KEY}"
|
||||
)
|
||||
|
||||
VOICE_NAME = "Charon"
|
||||
SEND_RATE = 16000
|
||||
RECEIVE_RATE = 24000
|
||||
CHUNK_SIZE = 512
|
||||
CHANNELS = 1
|
||||
|
||||
|
||||
def load_system_prompt():
|
||||
paths = [
|
||||
os.path.join(PROJECT_ROOT, "Config", "marcus_voice_prompt.txt"),
|
||||
]
|
||||
for p in paths:
|
||||
if os.path.exists(p):
|
||||
with open(p, "r", encoding="utf-8-sig") as f:
|
||||
return f.read().strip()
|
||||
return (
|
||||
"You are Marcus, a humanoid robot by YS Lootah Technology, Dubai. "
|
||||
"Speak Arabic (UAE dialect) and English. Be concise — 1-2 sentences max."
|
||||
)
|
||||
|
||||
|
||||
# ─── AUDIO HELPERS ────────────────────────────────────────
|
||||
|
||||
def audio_energy(pcm: bytes) -> int:
|
||||
try:
|
||||
samples = array.array("h", pcm)
|
||||
if not samples:
|
||||
return 0
|
||||
return sum(abs(s) for s in samples) // len(samples)
|
||||
except Exception:
|
||||
return 0
|
||||
|
||||
|
||||
SILENCE_PCM = b'\x00' * (CHUNK_SIZE * 2)
|
||||
|
||||
|
||||
# ─── GEMINI VOICE MODULE ─────────────────────────────────

class GeminiVoiceModule:
    """Real-time voice conversation via Gemini, using G1 speaker + Hollyland mic."""

    def __init__(self, audio_api, on_transcript=None):
        self._audio = audio_api
        self._on_transcript = on_transcript
        self._config = load_config("Voice")
        self._mic_source = getattr(audio_api, '_mic_source',
                                   self._config["mic"].get("source_index", "0"))

        # State
        self.speaking = False
        self.interrupted = False
        self._running = False
        self._thread = None
        self._audio_queue = None  # created in async context

        # Tuning
        self.MIN_THRESHOLD = 3000            # floor for the barge-in energy threshold
        self.barge_in_threshold = self.MIN_THRESHOLD
        self.REQUIRED_LOUD_CHUNKS = 10       # consecutive loud chunks before barge-in fires
        self.PREBUFFER_CHUNKS = 2            # chunks to queue before playback starts
        self.PLAYBACK_TIMEOUT = 0.25         # seconds to wait for the next audio chunk
        self.BARGE_IN_COOLDOWN = 0.7         # seconds to ignore barge-in after one fires
        self.AI_SPEAK_GRACE = 0.20           # seconds after AI starts before barge-in is allowed
        self.ECHO_GUARD_SEC = 0.8            # ignore ASR input this long after AI audio
        self.SPEAKING_ENERGY_GATE = 0.85     # fraction of threshold gating the mic while speaking
        self.SEND_SILENCE_WHEN_SPEAKING = True

        # Timing
        self._ai_speaking_since = 0.0
        self._last_ai_audio_time = 0.0
        self._barge_in_block_until = 0.0
        self._ignore_input_until = 0.0

        # ASR buffer
        self._asr_buf = ""
        self._asr_last_time = 0.0
        self.ASR_WINDOW_SEC = 2.0  # start a new buffer if transcripts pause this long

        # Find Hollyland mic PyAudio device index
        self._mic_device_idx = self._find_mic_device()

        log.info("GeminiVoiceModule v2 initialized")

    # ─── MIC DEVICE DETECTION ─────────────────────────────

    def _find_mic_device(self) -> int:
        """Find Hollyland wireless mic in PyAudio devices. Returns device index."""
        import pyaudio
        _suppress_alsa_errors()
        pa = pyaudio.PyAudio()
        try:
            # First: unmute the PulseAudio source and raise its volume
            subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
            subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)

            # Search for the wireless mic by name
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                name = info.get("name", "").lower()
                if info["maxInputChannels"] > 0 and ("wireless" in name or "hollyland" in name):
                    log.info("Mic found: [%d] %s (%dHz)", i, info["name"], int(info["defaultSampleRate"]))
                    return i

            # Fall back to the 'default' or 'pulse' device
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                if info["maxInputChannels"] > 0 and info.get("name", "") in ("default", "pulse"):
                    log.info("Mic fallback: [%d] %s", i, info["name"])
                    return i

            log.warning("No mic found, using device 0")
            return 0
        finally:
            pa.terminate()

    # ─── MIC CALIBRATION ──────────────────────────────────

    def _calibrate_mic(self):
        """Calibrate barge-in threshold from ambient noise."""
        import pyaudio
        _suppress_alsa_errors()
        pa = pyaudio.PyAudio()
        mic_info = pa.get_device_info_by_index(self._mic_device_idx)
        mic_rate = int(mic_info["defaultSampleRate"])
        mic_channels = 1
        try:
            stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
                             rate=mic_rate, input=True,
                             input_device_index=self._mic_device_idx,
                             frames_per_buffer=CHUNK_SIZE)
            values = []
            for _ in range(40):  # sample 40 chunks of ambient audio
                data = stream.read(CHUNK_SIZE, exception_on_overflow=False)
                values.append(audio_energy(data))
            stream.stop_stream()
            stream.close()
            avg_noise = sum(values) / len(values) if values else 0
            self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)
            log.info("Mic calibrated: noise=%.0f, threshold=%.0f", avg_noise, self.barge_in_threshold)
        except Exception as e:
            log.warning("Calibration failed: %s", e)
        finally:
            pa.terminate()

    # ─── G1 SPEAKER PLAYBACK ─────────────────────────────

    def _play_buffer_on_g1(self, pcm_24k: np.ndarray):
        """Play 24 kHz audio on the G1 speaker (resample to 16 kHz, single call)."""
        if len(pcm_24k) < 100:
            return

        # Resample 24 kHz → 16 kHz
        audio_16k = linear_resample(pcm_24k, RECEIVE_RATE, 16000)

        from unitree_sdk2py.g1.audio.g1_audio_api import (
            ROBOT_API_ID_AUDIO_START_PLAY,
            ROBOT_API_ID_AUDIO_STOP_PLAY,
        )

        client = self._audio._client
        if not client:
            return

        app_name = "gemini"
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))
        time.sleep(0.1)

        pcm = audio_16k.tobytes()
        sid = f"s_{int(time.time() * 1000)}"
        param = json.dumps({
            "app_name": app_name,
            "stream_id": sid,
            "sample_rate": 16000,
            "channels": 1,
            "bits_per_sample": 16,
        })
        client._CallRequestWithParamAndBin(ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm))

        # Block for the clip duration so `speaking` stays true during playback
        duration = len(audio_16k) / 16000
        time.sleep(duration + 0.3)
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))

    # ─── WEBSOCKET TASKS ─────────────────────────────────

    async def _capture_mic(self, ws):
        """Continuously capture mic via PyAudio and send to Gemini."""
        import pyaudio
        _suppress_alsa_errors()
        pa = pyaudio.PyAudio()

        mic_info = pa.get_device_info_by_index(self._mic_device_idx)
        mic_rate = int(mic_info["defaultSampleRate"])
        mic_channels = 1

        # Open mic at native rate/channels
        stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
                         rate=mic_rate, input=True,
                         input_device_index=self._mic_device_idx,
                         frames_per_buffer=CHUNK_SIZE)

        log.info("Mic stream: device=%d, rate=%d, ch=%d", self._mic_device_idx, mic_rate, mic_channels)

        loud_chunks = 0
        loop = asyncio.get_running_loop()
        needs_resample = mic_rate != SEND_RATE or mic_channels != 1

        try:
            while self._running:
                data = await loop.run_in_executor(
                    None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False))

                # Convert to mono 16 kHz if needed
                if needs_resample:
                    audio = np.frombuffer(data, dtype=np.int16)
                    if mic_channels == 2:  # stereo → mono
                        audio = audio.reshape(-1, 2).mean(axis=1).astype(np.int16)
                    if mic_rate != SEND_RATE:
                        audio = linear_resample(audio, mic_rate, SEND_RATE)
                    data = audio.tobytes()

                energy = audio_energy(data)
                now = time.time()

                # Barge-in detection
                if self.speaking and now >= self._barge_in_block_until:
                    if (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE:
                        if energy > self.barge_in_threshold:
                            loud_chunks += 1
                        else:
                            loud_chunks = 0
                        if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                            log.info("Barge-in detected!")
                            self.interrupted = True
                            self.speaking = False
                            while not self._audio_queue.empty():
                                try:
                                    self._audio_queue.get_nowait()
                                except asyncio.QueueEmpty:
                                    break
                            loud_chunks = 0
                            self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN

                # Echo suppression: send silence while speaking
                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = SILENCE_PCM

                # Send to Gemini
                b64 = base64.b64encode(data_to_send).decode()
                msg = {
                    "realtime_input": {
                        "media_chunks": [
                            {"mime_type": f"audio/pcm;rate={SEND_RATE}", "data": b64}
                        ]
                    }
                }
                await ws.send(json.dumps(msg))

        except Exception as e:
            if self._running:
                log.error("Mic error: %s", e)
        finally:
            stream.stop_stream()
            stream.close()
            pa.terminate()

    async def _receive_audio(self, ws):
        """Receive audio responses and transcriptions from Gemini."""
        async for msg in ws:
            if not self._running:
                break
            try:
                response = json.loads(msg)
                server_content = response.get("serverContent", {})

                # Server confirmed an interruption — clear the local flag
                if server_content.get("interrupted"):
                    self.interrupted = False

                # User transcription (partial/streaming); the field name has
                # varied across API revisions, so check all known spellings
                input_tr = (
                    server_content.get("inputTranscription")
                    or server_content.get("input_transcription")
                    or server_content.get("inputAudioTranscription")
                    or server_content.get("input_audio_transcription")
                )
                if isinstance(input_tr, dict):
                    text = (input_tr.get("text") or "").strip()
                    now = time.time()
                    if text and now >= self._ignore_input_until and not self.speaking:
                        # Buffer ASR text
                        if now - self._asr_last_time > self.ASR_WINDOW_SEC:
                            self._asr_buf = ""
                        self._asr_buf = text  # Gemini sends cumulative transcription
                        self._asr_last_time = now

                if self.interrupted:
                    continue

                # Audio from Gemini
                model_turn = server_content.get("modelTurn")
                if model_turn:
                    for part in model_turn.get("parts", []):
                        inline_data = part.get("inlineData")
                        if inline_data:
                            audio_b64 = inline_data.get("data")
                            if audio_b64:
                                now = time.time()
                                if not self.speaking:
                                    self._ai_speaking_since = now
                                    # Gemini started responding — fire transcript callback
                                    if self._asr_buf and self._on_transcript:
                                        self._on_transcript(self._asr_buf, "user")
                                self.speaking = True
                                self._last_ai_audio_time = now
                                self._ignore_input_until = now + self.ECHO_GUARD_SEC
                                audio_bytes = base64.b64decode(audio_b64)
                                await self._audio_queue.put(audio_bytes)

                        # Text from Gemini (thinking/response text)
                        text_part = part.get("text", "").strip()
                        if text_part and self._on_transcript:
                            self._on_transcript(text_part, "marcus")

                # Turn complete — Gemini finished speaking
                turn_complete = server_content.get("turnComplete")
                if turn_complete:
                    # Clear ASR buffer after turn
                    self._asr_buf = ""

            except Exception as e:
                log.error("Receive error: %s", e)

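    # For reference, the serverContent shape this handler reads — reconstructed
    # from the fields accessed above, not the full Live API schema:
    #   {"serverContent": {
    #       "inputTranscription": {"text": "..."},              # streaming user ASR
    #       "modelTurn": {"parts": [{"inlineData": {"data": "<b64 24 kHz PCM>"}},
    #                               {"text": "..."}]},
    #       "turnComplete": true,
    #       "interrupted": true}}
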
    async def _play_audio(self):
        """Collect Gemini audio chunks and play on the G1 speaker."""
        while self._running:
            try:
                if not self.speaking:
                    await asyncio.sleep(0.05)
                    continue

                # Pre-buffer
                buffered = False
                while self.speaking and not buffered:
                    if self._audio_queue.qsize() >= self.PREBUFFER_CHUNKS:
                        buffered = True
                    else:
                        await asyncio.sleep(0.01)

                # Collect all audio chunks
                buffer_chunks = []
                while self.speaking:
                    try:
                        data = await asyncio.wait_for(
                            self._audio_queue.get(), timeout=self.PLAYBACK_TIMEOUT)
                        audio = np.frombuffer(data, dtype=np.int16)
                        buffer_chunks.append(audio)
                        self._last_ai_audio_time = time.time()
                    except asyncio.TimeoutError:
                        if self._audio_queue.empty():
                            if time.time() - self._last_ai_audio_time > 0.3:
                                break

                # Play on G1 speaker
                if buffer_chunks:
                    full_audio = np.concatenate(buffer_chunks)
                    duration = len(full_audio) / RECEIVE_RATE
                    log.info("Playing %.1fs on G1", duration)

                    await asyncio.get_running_loop().run_in_executor(
                        None, self._play_buffer_on_g1, full_audio)

                self.speaking = False

            except Exception as e:
                log.error("Play error: %s", e)
                self.speaking = False

    # ─── MAIN LOOP ────────────────────────────────────────

    async def _run_async(self):
        import websockets
        import inspect

        system_prompt = load_system_prompt()

        # Unmute mic
        subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
        subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)

        # Calibrate
        self._calibrate_mic()

        # websockets renamed extra_headers → additional_headers in newer
        # releases; probe the signature so both library versions work.
        ws_kwargs = {"max_size": None}
        try:
            sig = inspect.signature(websockets.connect)
            if "extra_headers" in sig.parameters:
                ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
            else:
                ws_kwargs["additional_headers"] = {"Content-Type": "application/json"}
        except Exception:
            ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}

        while self._running:
            try:
                log.info("Connecting to Gemini...")
                async with websockets.connect(URI, **ws_kwargs) as ws:
                    setup_msg = {
                        "setup": {
                            "model": MODEL,
                            "generationConfig": {
                                "responseModalities": ["AUDIO"],
                                "thinkingConfig": {"thinkingBudget": 0},
                                "speechConfig": {
                                    "voiceConfig": {
                                        "prebuiltVoiceConfig": {"voiceName": VOICE_NAME}
                                    }
                                },
                            },
                            "realtimeInputConfig": {
                                "automaticActivityDetection": {
                                    "startOfSpeechSensitivity": "START_SENSITIVITY_HIGH",
                                    "prefixPaddingMs": 40,
                                    "endOfSpeechSensitivity": "END_SENSITIVITY_HIGH",
                                    "silenceDurationMs": 250,
                                }
                            },
                            "inputAudioTranscription": {},
                            "systemInstruction": {"parts": [{"text": system_prompt}]},
                        }
                    }
                    await ws.send(json.dumps(setup_msg))
                    await ws.recv()  # wait for the setup acknowledgement
                    log.info("Connected! Always listening...")

                    self._audio_queue = asyncio.Queue()

                    await asyncio.gather(
                        self._capture_mic(ws),
                        self._receive_audio(ws),
                        self._play_audio(),
                    )

            except Exception as e:
                if self._running:
                    log.error("Connection error: %s — reconnecting in 3s", e)
                    await asyncio.sleep(3)

    def _voice_thread(self):
        # asyncio.run creates a fresh event loop in this background thread
        asyncio.run(self._run_async())

    # ─── START / STOP ─────────────────────────────────────

    def start(self):
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(target=self._voice_thread, daemon=True, name="gemini_voice")
        self._thread.start()
        log.info("Gemini voice module started")

    def stop(self):
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Gemini voice module stopped")

    @property
    def is_running(self) -> bool:
        return self._running

    @property
    def state(self) -> str:
        return "LISTENING" if self._running else "STOPPED"

    @property
    def is_speaking(self) -> bool:
        return self.speaking


# ─── STANDALONE TEST ─────────────────────────────────────

if __name__ == "__main__":
    import sys
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def on_transcript(text, role):
        print(f" [{role.upper()}] {text}")

    audio = AudioAPI()
    voice = GeminiVoiceModule(audio, on_transcript=on_transcript)

    print("Gemini voice v2 — speak anytime. Ctrl+C to stop.\n")
    voice.start()

    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
        voice.stop()
@ -1,19 +1,20 @@
#!/usr/bin/env python3
"""
Features/Voice/marcus_voice.py — Marcus Always-Listening Voice Module
======================================================================
Voice/marcus_voice.py — Marcus Always-Listening Voice Module (English)
=======================================================================
State machine:
    IDLE → (wake word detected) → WAKE_HEARD
    WAKE_HEARD → (record command) → PROCESSING
    PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
    SPEAKING → (TTS done) → IDLE

Wake word: "Marcus" / "ماركوس" (detected by Whisper tiny)
Wake word: "Marcus" (detected by Whisper tiny)
Commands: Transcribed by Whisper small
TTS: Handled by API/audio_api.py
Mic: G1 built-in array mic via UDP multicast (Voice/builtin_mic.py)
TTS: English only, Unitree built-in TtsMaker (API/audio_api.py)

Usage:
    from Features.Voice.marcus_voice import VoiceModule
    from Voice.marcus_voice import VoiceModule
    voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
    voice.start()  # background thread
    voice.stop()
@ -21,7 +22,6 @@ Usage:

import logging
import os
import subprocess
import threading
import time
import numpy as np
@ -74,7 +74,8 @@ class VoiceModule:
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — called when command is transcribed
            on_command: callback(text: str, lang: str) — "lang" is always "en"
                now; kept in the signature for interface stability.
        """
        self._audio = audio_api
        self._on_command = on_command
@ -83,13 +84,23 @@ class VoiceModule:
        self._stt = self._config["stt"]
        self._mic = self._config["mic"]

        # Whisper models — lazy loaded
        # Whisper models — lazy loaded on first _voice_loop() iteration
        self._wake_model = None
        self._cmd_model = None

        # Wake words
        self._wake_en = [w.lower() for w in self._stt["wake_words_en"]]
        self._wake_ar = self._stt["wake_words_ar"]
        # Wake words (English only — built-in TTS doesn't do Arabic)
        self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
                                                          ["marcus", "marcos"])]

        # G1 built-in mic (UDP multicast).
        from Voice.builtin_mic import BuiltinMic
        _mcfg = self._config.get("mic_udp", {})
        self._mic_capture = BuiltinMic(
            group=_mcfg.get("group", "239.168.123.161"),
            port=_mcfg.get("port", 5555),
            buf_max=_mcfg.get("buffer_max_bytes", 64000),
        )
        self._sample_rate = self._mic_capture.sample_rate  # 16000

        # State
        self._state = State.IDLE
@ -97,7 +108,7 @@ class VoiceModule:
        self._thread = None
        self._lock = threading.Lock()

        log.info("VoiceModule initialized")
        log.info("VoiceModule initialized (mic: G1 built-in UDP)")

    # ─── MODEL LOADING ────────────────────────────────────

@ -115,69 +126,49 @@ class VoiceModule:
        self._cmd_model = whisper.load_model(self._stt["command_model"])
        log.info("Command model ready")

    # ─── MIC RECORDING ────────────────────────────────────
    # ─── MIC RECORDING (G1 built-in UDP) ──────────────────

    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Record audio chunk from mic via parec."""
        source = self._mic["source_index"]
        rate = str(self._mic["rate"])

        proc = subprocess.Popen(
            ["parec", "-d", source,
             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )
        time.sleep(seconds)
        proc.terminate()
        raw = proc.stdout.read()
        return np.frombuffer(raw, dtype=np.int16)
        """Capture a fixed-duration chunk from the G1 built-in mic."""
        num_bytes = int(seconds * self._sample_rate * 2)  # int16 mono
        raw = bytearray()
        bite = 1024
        while len(raw) < num_bytes:
            raw.extend(self._mic_capture.read_chunk(min(bite, num_bytes - len(raw))))
        return np.frombuffer(bytes(raw), dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Record until silence is detected or max duration reached."""
        source = self._mic["source_index"]
        rate = self._mic["rate"]
        threshold = self._stt["silence_threshold"]
        silence_dur = self._stt["silence_duration_sec"]
        max_dur = self._stt["max_record_sec"]
        """Capture until RMS drops below threshold for `silence_duration_sec`."""
        threshold = self._stt.get("silence_threshold", 500)
        silence_dur = self._stt.get("silence_duration_sec", 1.5)
        max_dur = self._stt.get("max_record_sec", 15)

        chunk_sec = 0.5
        chunk_samples = int(rate * chunk_sec)
        silence_chunks_needed = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)
        chunk_sec = 0.5
        chunk_bytes = int(self._sample_rate * chunk_sec) * 2
        silence_chunks_need = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)

        proc = subprocess.Popen(
            ["parec", "-d", source,
             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )

        all_audio = []
        all_audio = []
        silence_count = 0
        chunk_count = 0
        chunk_count = 0

        try:
            while chunk_count < max_chunks:
                data = proc.stdout.read(chunk_samples * 2)  # 2 bytes per sample
                if not data:
                    break
        while chunk_count < max_chunks:
            raw = self._mic_capture.read_chunk(chunk_bytes)
            if not raw:
                break
            chunk = np.frombuffer(raw, dtype=np.int16)
            all_audio.append(chunk)
            chunk_count += 1

                chunk = np.frombuffer(data, dtype=np.int16)
                all_audio.append(chunk)
                chunk_count += 1
            rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
            if rms < threshold:
                silence_count += 1
            else:
                silence_count = 0

                # Check for silence
                rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
                if rms < threshold:
                    silence_count += 1
                else:
                    silence_count = 0

                if silence_count >= silence_chunks_needed and chunk_count > 2:
                    log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                    break
        finally:
            proc.terminate()
            proc.stdout.read()  # drain
            if silence_count >= silence_chunks_need and chunk_count > 2:
                log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                break

        if all_audio:
            return np.concatenate(all_audio)
@ -205,38 +196,18 @@ class VoiceModule:
        return text

    def _check_wake_word(self, text: str) -> bool:
        """Check if transcribed text contains a wake word."""
        """Check if transcribed text contains an English wake word."""
        text_lower = text.lower().strip()

        # English wake words
        for w in self._wake_en:
            if w in text_lower:
                return True

        # Arabic wake words
        for w in self._wake_ar:
            if w in text:
                return True

        return False
        return any(w in text_lower for w in self._wake_en)

    # ─── MAIN LOOP ────────────────────────────────────────

    def _voice_loop(self):
        """Main voice processing loop — runs in background thread."""
        self._load_whisper()
        self._mic_capture.start()
        log.info("Voice loop started — listening for wake word...")

        # Unmute mic once
        subprocess.run(
            ["pactl", "set-source-mute", self._mic["source_index"], "0"],
            capture_output=True,
        )
        subprocess.run(
            ["pactl", "set-source-volume", self._mic["source_index"], "100%"],
            capture_output=True,
        )

        while self._running:
            try:
                if self._state == State.IDLE:
@ -279,9 +250,7 @@ class VoiceModule:
                    self._state = State.WAKE_HEARD

                    # Acknowledge
                    self._audio.speak(
                        self._config["messages"]["wake_heard"], "en"
                    )
                    self._audio.speak(self._config["messages"]["wake_heard"])

    def _do_wake_heard(self):
        """Record the command until silence."""
@ -294,7 +263,7 @@ class VoiceModule:

        if len(audio) < 4000:  # < 0.25s at 16kHz
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return

@ -308,18 +277,16 @@ class VoiceModule:

        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._audio.speak(self._config["messages"]["no_speech"])
            self._state = State.IDLE
            return

        # Detect language
        lang = "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
        log.info("Command [%s]: %s", lang, text)
        log.info("Command: %s", text)

        # Send to brain callback
        # Send to brain callback (lang always "en" in this build)
        if self._on_command:
            try:
                self._on_command(text, lang)
                self._on_command(text, "en")
            except Exception as e:
                log.error("Brain callback error: %s", e)

@ -342,6 +309,10 @@ class VoiceModule:
    def stop(self):
        """Stop voice listening."""
        self._running = False
        try:
            self._mic_capture.stop()
        except Exception:
            pass
        if self._thread:
            self._thread.join(timeout=5)
        self._thread = None