# Marcus/Voice/gemini_runner.py

#!/usr/bin/env python3
"""Voice/gemini_runner.py — Gemini Live S2S subprocess (Option 3).
Runs in a Python 3.10+ env (the `gemini_sdk` conda env on this Jetson) so
it can import `google-genai` (which doesn't support Python 3.8). The
marcus env itself is pinned to Python 3.8 by the NVIDIA Jetson torch
wheel, so Gemini has to live in its own process.
This is the full Sanad-pattern speech-to-speech variant:
- response_modalities=["AUDIO"] → Gemini speaks back through G1 speaker
- input_audio_transcription → user transcripts emitted on stdout for
Marcus's wake-word side channel
- output_audio_transcription → Gemini's reply text logged for review
- barge-in detection → user speaking over AI cuts AI off
- echo suppression → mic muted during AI playback
- JPEG camera frames over stdin → Marcus parent forwards frames; runner
streams them to Gemini Live so the
vision answers ("what do you see")
are correct, not hallucinated.
The runner owns the G1 mic AND the G1 speaker (unitree_sdk2py works fine
in the gemini_sdk env on this Jetson — already verified).
────────────────────────────────────────────────────────────────────────
Stdout protocol (one JSON object per line, UTF-8):
{"type":"ready"} session connected
{"type":"user", "text":"..."} user input transcription
{"type":"bot", "text":"..."} Gemini's reply text (logged only — Gemini also speaks it)
{"type":"turn_end"}
{"type":"barge_in"}
{"type":"reconnect", "reason":"..."}
{"type":"log", "level":"info|warn|error", "msg":"..."}
Stdin protocol (line-based):
"stop\n" graceful shutdown
"flush\n" drop mic buffer (echo prevention)
"frame:<base64-jpeg>\n" forward a camera frame to Gemini Live
(Marcus parent throttles to ~2 fps)
────────────────────────────────────────────────────────────────────────
Env vars:
MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY)
MARCUS_GEMINI_MODEL (optional)
MARCUS_GEMINI_VOICE (optional)
MARCUS_PROJECT_ROOT (optional)
"""
from __future__ import annotations
import array
import asyncio
import base64
import json
import os
import signal
import sys
import threading
import time
from typing import Any
import numpy as np
# Resolve the project root so sibling packages (Voice/, Core/) are importable
# when this file is launched directly as a subprocess.
_PROJECT_ROOT = (
    os.environ.get("MARCUS_PROJECT_ROOT")
    or os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
from Voice.audio_io import BuiltinMic, BuiltinSpeaker
from Voice.turn_recorder import TurnRecorder
try:
    from Core.config_loader import load_config
    _VCFG = load_config("Voice") or {}
except Exception:
    # Config loading is best-effort; every consumer below has a default.
    _VCFG = {}
# Sub-sections of the Voice config consulted throughout this module.
_STT = _VCFG.get("stt", {})
_SPK = _VCFG.get("speaker", {})
# ─── stdout / stderr helpers ─────────────────────────────────────
# Serializes concurrent writers (asyncio loop + watcher threads) so protocol
# lines never interleave on stdout.
_stdout_lock = threading.Lock()


def emit(payload: dict) -> None:
    """Write one compact JSON object to stdout as a single protocol line."""
    serialized = json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
    with _stdout_lock:
        sys.stdout.write(f"{serialized}\n")
        sys.stdout.flush()


def log(level: str, msg: str) -> None:
    """Send a structured log record to the parent over the stdout protocol."""
    emit({"type": "log", "level": level, "msg": msg})
# ─── stdin watcher (graceful shutdown + flush + frames) ──────────
# Set when the parent sends "stop" or a termination signal arrives; every
# loop in this module polls it cooperatively.
_STOP_REQUESTED = threading.Event()
_MIC_HOLDER: list = []  # [BuiltinMic] when active
# Latest JPEG frame from the parent (raw bytes). The frame-send loop
# picks this up and ships it to Gemini Live.
_LATEST_FRAME_LOCK = threading.Lock()
_LATEST_FRAME: dict = {"bytes": None, "ts": 0.0}
def _stdin_watcher() -> None:
    """Consume parent commands from stdin until EOF or a "stop" command.

    Recognized lines: "stop", "flush", and "frame:<base64-jpeg>" (see the
    module docstring's stdin protocol). Unknown lines are ignored.
    """
    try:
        for raw_line in sys.stdin:
            text = raw_line.rstrip("\n")
            if not text:
                continue
            # Match the cheap commands first, then fall through to frame:
            lowered = text.lower()
            if lowered == "stop":
                log("info", "stop received from parent — exiting")
                _STOP_REQUESTED.set()
                return
            if lowered == "flush":
                if _MIC_HOLDER:
                    try:
                        _MIC_HOLDER[0].flush()
                    except Exception:
                        pass
                continue
            if text.startswith("frame:"):
                encoded = text[len("frame:"):]
                try:
                    jpeg = base64.b64decode(encoded)
                except Exception:
                    continue  # corrupt payload — drop it silently
                if jpeg:
                    with _LATEST_FRAME_LOCK:
                        _LATEST_FRAME["bytes"] = jpeg
                        _LATEST_FRAME["ts"] = time.time()
    except Exception:
        return


threading.Thread(target=_stdin_watcher, daemon=True, name="stdin-watcher").start()
def _install_signal_handlers() -> None:
    """Translate SIGTERM/SIGINT into _STOP_REQUESTED for a clean shutdown."""
    def _on_signal(_signum, _frame):
        log("info", "signal received — exiting")
        _STOP_REQUESTED.set()

    for _sig in (signal.SIGTERM, signal.SIGINT):
        try:
            signal.signal(_sig, _on_signal)
        except Exception:
            # e.g. not running in the main thread — best-effort only.
            pass
# ─── tunables ────────────────────────────────────────────────────
# Model / voice / API key: environment variables win over Voice config.
_MODEL = os.environ.get(
    "MARCUS_GEMINI_MODEL",
    _STT.get("gemini_model", "gemini-2.5-flash-native-audio-preview-12-2025"),
)
_VOICE = os.environ.get(
    "MARCUS_GEMINI_VOICE",
    _STT.get("gemini_voice_name", "Charon"),
)
_API_KEY = (
    os.environ.get("MARCUS_GEMINI_API_KEY")
    or os.environ.get("SANAD_GEMINI_API_KEY")
    or _STT.get("gemini_api_key", "")
)
# Software gain applied to every mic chunk before sending.
_MIC_GAIN = float(_STT.get("mic_gain", 1.0))
# Hard cap on one Live session before reconnecting.
_SESSION_TIMEOUT = float(_STT.get("gemini_session_timeout_sec", 660))
# Reconnect backoff cap and the error count that triggers a client rebuild.
_MAX_RECONNECT_DELAY = float(_STT.get("gemini_max_reconnect_delay_sec", 30))
_MAX_CONSECUTIVE_ERRORS = int(_STT.get("gemini_max_consecutive_errors", 10))
# Watchdog: receive loop bails out if the server goes silent this long.
_NO_MESSAGES_TIMEOUT = float(_STT.get("gemini_no_messages_timeout_sec", 30))
# PCM rates: 16 kHz up to Gemini, 24 kHz back from it (defaults).
SEND_SAMPLE_RATE = int(_STT.get("gemini_send_sample_rate", 16000))
RECEIVE_SAMPLE_RATE = int(_STT.get("gemini_receive_sample_rate", 24000))
CHUNK_SIZE = int(_STT.get("gemini_chunk_size", 512))  # samples per mic read
_CHUNK_BYTES = CHUNK_SIZE * 2  # int16 → 2 bytes per sample
_SILENCE_PCM = b"\x00" * _CHUNK_BYTES  # substituted during echo suppression
# Barge-in: sustained loud mic chunks while the AI speaks cut it off.
_BARGE_THRESHOLD = int(_STT.get("gemini_barge_threshold", 500))
_BARGE_CHUNKS = int(_STT.get("gemini_barge_loud_chunks_needed", 3))
_BARGE_COOLDOWN = float(_STT.get("gemini_barge_cooldown_sec", 0.3))
# Mic frames quieter than this are muted while the AI is speaking.
_ECHO_SUPPRESS_BELOW = int(_STT.get("gemini_echo_suppress_below", 500))
# Ignore barge-in energy for this long after the AI starts talking.
_AI_GRACE = float(_STT.get("gemini_ai_speak_grace_sec", 0.15))
# Camera frame forwarding cadence and staleness cutoff.
_FRAME_INTERVAL = float(_STT.get("gemini_frame_interval_sec", 0.5))
_FRAME_MAX_AGE = float(_STT.get("gemini_frame_max_age_sec", 1.5))
# Per-turn WAV recording of user + robot audio.
_REC_ENABLED = bool(_STT.get("gemini_record_enabled", True))
_DATA_DIR = os.path.join(
    _PROJECT_ROOT,
    _VCFG.get("audio", {}).get("data_dir", "Data/Voice/Recordings"),
    "gemini_turns",
)
# System prompt: env var beats config string; an optional prompt *file*
# (gemini_system_prompt_file) overrides both when present and non-empty.
_SYS_PROMPT = (
    os.environ.get("MARCUS_GEMINI_SYSTEM_PROMPT")
    or _STT.get(
        "gemini_system_prompt",
        "You are Sanad, a humanoid robot assistant. Reply briefly.",
    )
)
_SP_FILE = _STT.get("gemini_system_prompt_file", "")
if _SP_FILE:
    # Relative paths are resolved against the project root.
    _sp_path = (
        _SP_FILE if os.path.isabs(_SP_FILE)
        else os.path.join(_PROJECT_ROOT, _SP_FILE)
    )
    try:
        with open(_sp_path, "r", encoding="utf-8") as f:
            txt = f.read().strip()
            if txt:
                _SYS_PROMPT = txt
    except Exception:
        # Missing/unreadable prompt file — keep the inline default.
        pass
def _audio_energy(pcm: bytes) -> int:
try:
samples = array.array("h", pcm)
return sum(abs(s) for s in samples) // len(samples) if samples else 0
except Exception:
return 0
# ─── G1 speaker (audio playback) — initialized in main_async ────
def _init_g1_speaker() -> BuiltinSpeaker | None:
    """Initialise the G1 DDS audio client and wrap it in a BuiltinSpeaker.

    Sanad's audio_io.BuiltinSpeaker normally takes an already-initialised
    AudioClient. This subprocess owns its own DDS init. Returns None when
    the SDK is unavailable or the audio client cannot be brought up.
    """
    try:
        from unitree_sdk2py.core.channel import ChannelFactoryInitialize
        from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
    except Exception as e:
        log("error", f"unitree_sdk2py not importable in this env: {e}")
        return None

    interface = _SPK.get("dds_interface", "eth0")
    try:
        ChannelFactoryInitialize(0, interface)
    except Exception as e:
        # Already initialised in this process — that's fine.
        log("warn", f"ChannelFactoryInitialize: {e}")

    try:
        audio_client = AudioClient()
        audio_client.SetTimeout(10.0)
        audio_client.Init()
        try:
            audio_client.SetVolume(int(_SPK.get("volume", 100)))
        except Exception:
            pass  # volume is cosmetic — playback still works without it
    except Exception as e:
        log("error", f"AudioClient init failed: {e}")
        return None

    return BuiltinSpeaker(audio_client, app_name=_SPK.get("app_name", "sanad"))
# ─── per-session state (reset on each connect) ───────────────────
class _Session:
    """Mutable playback/barge-in flags for one Gemini Live connection."""

    def __init__(self):
        # True while Gemini reply audio is being played through the speaker.
        self.speaking = False
        # True once speaker streaming has begun for the current reply.
        self.stream_started = False
        # Barge-in detection is suppressed until this wall-clock time.
        self.barge_block_until = 0.0
        # Wall-clock time the current AI reply started playing.
        self.ai_speak_start = 0.0
        # Wall-clock time of the most recent AI audio chunk.
        self.last_ai_audio = 0.0
# ─── main async loop ─────────────────────────────────────────────
def _build_config(types):
    """Assemble the LiveConnectConfig: spoken replies, VAD tuning, transcripts.

    `types` is the google.genai.types module, passed in because the SDK is
    imported lazily inside main_async.
    """
    vad_start = _STT.get("gemini_vad_start_sensitivity", "START_SENSITIVITY_HIGH")
    vad_end = _STT.get("gemini_vad_end_sensitivity", "END_SENSITIVITY_LOW")
    prefix_ms = int(_STT.get("gemini_vad_prefix_padding_ms", 20))
    silence_ms = int(_STT.get("gemini_vad_silence_duration_ms", 200))

    speech = types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=_VOICE),
        ),
    )
    realtime = types.RealtimeInputConfig(
        automatic_activity_detection=types.AutomaticActivityDetection(
            disabled=False,
            start_of_speech_sensitivity=getattr(types.StartSensitivity, vad_start),
            end_of_speech_sensitivity=getattr(types.EndSensitivity, vad_end),
            prefix_padding_ms=prefix_ms,
            silence_duration_ms=silence_ms,
        ),
    )
    return types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        speech_config=speech,
        realtime_input_config=realtime,
        # Both transcript streams: user text feeds the parent's side channel,
        # bot text is logged for review.
        input_audio_transcription=types.AudioTranscriptionConfig(),
        output_audio_transcription=types.AudioTranscriptionConfig(),
        system_instruction=types.Content(
            parts=[types.Part(text=_SYS_PROMPT)],
        ),
    )
async def _send_mic_loop(session, types_mod, mic, speaker, recorder, sess: _Session, done: asyncio.Event) -> None:
    """Stream G1 mic PCM to Gemini Live, with barge-in and echo suppression.

    Runs until `done` is set or stop is requested. Mic reads happen in the
    default executor so the event loop is never blocked.
    """
    loop = asyncio.get_event_loop()
    # Sleep one chunk's duration per iteration → roughly real-time pacing.
    frame_pause = CHUNK_SIZE / float(SEND_SAMPLE_RATE)
    loud_count = 0
    last_activity = time.time()
    while not done.is_set() and not _STOP_REQUESTED.is_set():
        try:
            raw = await loop.run_in_executor(None, mic.read_chunk, _CHUNK_BYTES)
        except Exception as e:
            log("warn", f"mic read failed: {e}")
            break
        if not raw:
            await asyncio.sleep(frame_pause)
            continue
        # Optional software gain, clipped to the int16 range.
        if _MIC_GAIN != 1.0:
            samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
            samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
            raw = samples.tobytes()
        energy = _audio_energy(raw)
        now = time.time()
        # Barge-in detection — sustained user energy interrupts the AI.
        if sess.speaking and now >= sess.barge_block_until:
            if (now - sess.ai_speak_start) >= _AI_GRACE:
                if energy > _BARGE_THRESHOLD:
                    loud_count += 1
                else:
                    loud_count = max(0, loud_count - 1)
                if loud_count > _BARGE_CHUNKS:
                    log("info", f"BARGE-IN (e={energy})")
                    emit({"type": "barge_in"})
                    sess.speaking = False
                    sess.stream_started = False
                    try:
                        speaker.stop()
                    except Exception:
                        pass
                    try:
                        mic.flush()
                    except Exception:
                        pass
                    try:
                        recorder.finish_turn()
                    except Exception:
                        pass
                    loud_count = 0
                    # Brief cooldown so one interruption doesn't retrigger.
                    sess.barge_block_until = now + _BARGE_COOLDOWN
        # Echo suppression — while AI speaks, mute quiet mic frames so the
        # mic doesn't feed Gemini its own voice.
        send_data = raw
        if sess.speaking and energy < _ECHO_SUPPRESS_BELOW:
            send_data = _SILENCE_PCM
        # Capture user audio for the per-turn WAV (only when user actually speaks).
        if energy > 250 and not sess.speaking:
            try:
                recorder.capture_user(raw)
            except Exception:
                pass
        # Heartbeat log every ~10s of silence so the parent knows we're alive.
        if energy > 250:
            last_activity = now
        elif now - last_activity > 10:
            log("info", f"alive (no speech {now - last_activity:.0f}s)")
            last_activity = now
        try:
            await session.send_realtime_input(
                audio=types_mod.Blob(
                    data=send_data,
                    mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}",
                ),
            )
        except asyncio.CancelledError:
            return
        except Exception as e:
            # A failed send usually means the session died — stop all loops.
            log("warn", f"mic send failed: {e}")
            done.set()
            return
        await asyncio.sleep(frame_pause)
async def _send_frame_loop(session, types_mod, done: asyncio.Event) -> None:
    """Periodically push the latest cached camera frame (JPEG) to Gemini Live."""
    while not (done.is_set() or _STOP_REQUESTED.is_set()):
        await asyncio.sleep(_FRAME_INTERVAL)
        with _LATEST_FRAME_LOCK:
            jpeg = _LATEST_FRAME.get("bytes")
            captured_at = _LATEST_FRAME.get("ts", 0.0)
        # Skip when no frame has arrived yet, or the cached one is stale —
        # don't waste tokens streaming a frame the camera abandoned.
        if not jpeg or (time.time() - captured_at) > _FRAME_MAX_AGE:
            continue
        try:
            await session.send_realtime_input(
                video=types_mod.Blob(data=jpeg, mime_type="image/jpeg"),
            )
        except asyncio.CancelledError:
            return
        except Exception as e:
            # Keep going — frames are best-effort.
            log("warn", f"frame send failed: {e}")
async def _receive_loop(session, speaker, recorder, sess: _Session, done: asyncio.Event) -> None:
    """Drain Gemini Live server events: transcripts, reply audio, turn ends.

    Plays reply audio through the G1 speaker (in the executor, to avoid
    blocking the loop), forwards transcripts on the stdout protocol, and
    always sets `done` on exit so the sibling send loops stop too.
    """
    loop = asyncio.get_event_loop()
    last_recv = time.time()
    try:
        while not done.is_set() and not _STOP_REQUESTED.is_set():
            async for response in session.receive():
                last_recv = time.time()
                if done.is_set():
                    break
                # Server asked us to reconnect — propagate to the parent.
                if (hasattr(response, "go_away")
                        and getattr(response, "go_away", None) is not None):
                    emit({"type": "reconnect", "reason": "server go_away"})
                    done.set()
                    return
                sc = getattr(response, "server_content", None)
                if sc is None:
                    continue
                # Server-side interruption: stop playback and close the turn.
                if getattr(sc, "interrupted", False) is True:
                    if sess.speaking:
                        log("info", "Gemini interrupted by server")
                        sess.speaking = False
                        sess.stream_started = False
                        try:
                            speaker.stop()
                        except Exception:
                            pass
                        try:
                            recorder.finish_turn()
                        except Exception:
                            pass
                    continue
                # User-speech transcription → parent's wake-word side channel.
                it = getattr(sc, "input_transcription", None)
                if it is not None:
                    text = (getattr(it, "text", "") or "").strip()
                    # Suppressed while the AI speaks: it would mostly be echo.
                    if text and not sess.speaking:
                        emit({"type": "user", "text": text})
                        try:
                            recorder.add_user_text(text)
                        except Exception:
                            pass
                # Gemini's own reply text (logged only — Gemini also speaks it).
                ot = getattr(sc, "output_transcription", None)
                if ot is not None:
                    text = (getattr(ot, "text", "") or "").strip()
                    if text:
                        emit({"type": "bot", "text": text})
                        try:
                            recorder.add_robot_text(text)
                        except Exception:
                            pass
                # Reply audio chunks → G1 speaker.
                mt = getattr(sc, "model_turn", None)
                if mt is not None:
                    for part in getattr(mt, "parts", []) or []:
                        inl = getattr(part, "inline_data", None)
                        if inl is not None and getattr(inl, "data", None):
                            now = time.time()
                            if not sess.speaking:
                                # First chunk of a reply: mark speech start.
                                sess.ai_speak_start = now
                                sess.speaking = True
                            sess.last_ai_audio = now
                            raw_audio = inl.data
                            try:
                                recorder.capture_robot(raw_audio)
                            except Exception:
                                pass
                            audio_arr = np.frombuffer(raw_audio, dtype=np.int16)
                            if not sess.stream_started:
                                await loop.run_in_executor(None, speaker.begin_stream)
                                sess.stream_started = True
                            await loop.run_in_executor(
                                None, speaker.send_chunk,
                                audio_arr, RECEIVE_SAMPLE_RATE,
                            )
                # End of the model's turn: drain the speaker, reset state.
                if getattr(sc, "turn_complete", False):
                    if sess.speaking and sess.stream_started and not speaker.interrupted:
                        log("info", f"speaker {speaker.total_sent_sec:.1f}s")
                        await loop.run_in_executor(None, speaker.wait_finish)
                    elif sess.speaking and speaker.interrupted:
                        log("info", "speaker interrupted")
                    sess.speaking = False
                    sess.stream_started = False
                    try:
                        # Drop any echo of the just-played reply.
                        _MIC_HOLDER and _MIC_HOLDER[0].flush()
                    except Exception:
                        pass
                    try:
                        recorder.finish_turn()
                    except Exception:
                        pass
                    emit({"type": "turn_end"})
            # receive() iterator ended — check for a stalled link before retrying.
            if time.time() - last_recv > _NO_MESSAGES_TIMEOUT:
                log("warn", f"no messages from Gemini for {_NO_MESSAGES_TIMEOUT:.0f}s")
                break
            await asyncio.sleep(0.1)
    except asyncio.CancelledError:
        return
    except Exception as e:
        log("warn", f"receive ended: {e}")
    finally:
        done.set()
async def main_async() -> int:
    """Connect / stream / reconnect until stop is requested.

    Returns a process exit code: 0 on clean shutdown, 2 = SDK missing,
    3 = no API key, 4 = client creation failed, 5 = G1 speaker unavailable.
    """
    if not _API_KEY:
        log("error", "no Gemini API key (set MARCUS_GEMINI_API_KEY)")
        return 3
    # Lazy import: google-genai is only available in the gemini_sdk env.
    try:
        from google import genai
        from google.genai import types
    except Exception as e:
        log("error", f"google-genai not importable: {e}")
        return 2
    try:
        client = genai.Client(api_key=_API_KEY)
    except Exception as e:
        log("error", f"failed to create Gemini client: {e}")
        return 4
    config = _build_config(types)
    # This subprocess owns the G1 mic for its whole lifetime; the stdin
    # watcher reaches it through _MIC_HOLDER for "flush" commands.
    mic = BuiltinMic()
    mic.start()
    _MIC_HOLDER.append(mic)
    speaker = _init_g1_speaker()
    if speaker is None:
        log("error", "G1 speaker not available — exiting")
        try:
            mic.stop()
        except Exception:
            pass
        return 5
    recorder = TurnRecorder(
        enabled=_REC_ENABLED,
        out_dir=_DATA_DIR,
        user_rate=SEND_SAMPLE_RATE,
        robot_rate=RECEIVE_SAMPLE_RATE,
    )
    session_num = 0
    consecutive_errors = 0
    start = time.time()
    try:
        # Outer reconnect loop — each iteration is one Live session.
        while not _STOP_REQUESTED.is_set():
            session_num += 1
            uptime_min = (time.time() - start) / 60
            try:
                log("info", f"connecting (session #{session_num}, uptime {uptime_min:.0f}m)")
                async with client.aio.live.connect(model=_MODEL, config=config) as session:
                    emit({"type": "ready"})
                    consecutive_errors = 0
                    mic.flush()
                    sess = _Session()
                    done = asyncio.Event()
                    # Run the three loops together; any one setting `done`
                    # (or the session timeout) ends all of them.
                    try:
                        await asyncio.wait_for(
                            asyncio.gather(
                                _send_mic_loop(session, types, mic, speaker, recorder, sess, done),
                                _send_frame_loop(session, types, done),
                                _receive_loop(session, speaker, recorder, sess, done),
                            ),
                            timeout=_SESSION_TIMEOUT,
                        )
                    except asyncio.TimeoutError:
                        log("info", f"session timed out after {_SESSION_TIMEOUT:.0f}s")
                    except asyncio.CancelledError:
                        pass
                log("info", f"session #{session_num} ended — reconnecting in 1s")
                try:
                    speaker.stop()
                except Exception:
                    pass
                try:
                    mic.flush()
                except Exception:
                    pass
                if _STOP_REQUESTED.is_set():
                    break
                await asyncio.sleep(1)
            except asyncio.CancelledError:
                break
            except Exception as e:
                consecutive_errors += 1
                # Exponential backoff, capped at _MAX_RECONNECT_DELAY.
                delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
                log("error", f"session error #{consecutive_errors}: {e} — retry in {delay:.0f}s")
                try:
                    await asyncio.sleep(delay)
                except asyncio.CancelledError:
                    break
                # Persistent failures: rebuild the client from scratch.
                if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
                    log("warn", f"{consecutive_errors} consecutive errors — recreating client")
                    try:
                        client = genai.Client(api_key=_API_KEY)
                        consecutive_errors = 0
                    except Exception as ce:
                        log("error", f"client recreation failed: {ce}")
    finally:
        # Always release the hardware, even on unexpected exits.
        try:
            mic.stop()
        except Exception:
            pass
        try:
            speaker.stop()
        except Exception:
            pass
    return 0
def main() -> int:
    """Process entry point: wire signal handlers, then drive the async loop."""
    _install_signal_handlers()
    try:
        exit_code = asyncio.run(main_async())
    except KeyboardInterrupt:
        exit_code = 0
    except Exception as e:
        log("error", f"fatal: {e}")
        exit_code = 4
    return exit_code


if __name__ == "__main__":
    sys.exit(main())