#!/usr/bin/env python3 """Voice/gemini_runner.py — Gemini Live STT subprocess. Runs in a Python 3.10+ env (the `gemini_sdk` conda env on this Jetson) so it can import `google-genai`, which doesn't support Python 3.8. The marcus env itself is pinned to Python 3.8 by the NVIDIA Jetson torch wheel, so Gemini has to live in its own process — the same pattern Sanad uses. The marcus parent process spawns this script via: /path/to/gemini_sdk/python -u Voice/gemini_runner.py and parses the JSON-lines stream we emit on stdout. The parent never sees audio bytes — this script owns the mic, the Gemini WebSocket, AND the WAV recording, so the IPC boundary stays narrow (just transcripts). ──────────────────────────────────────────────────────────────────────── Stdout protocol (one JSON object per line, UTF-8): {"type":"ready"} session connected, mic is live {"type":"user", "text":"..."} user input transcription {"type":"bot", "text":"..."} Gemini's text reply (logged only — never spoken) {"type":"turn_end"} Gemini emitted turn_complete {"type":"reconnect", "reason":"..."} session ended, will reconnect {"type":"log", "level":"info|warn|error", "msg":"..."} Stdin protocol (line-based): "stop\n" request graceful shutdown Exit codes: 0 — clean shutdown after "stop" or signal 2 — google-genai not importable 3 — no API key 4 — fatal session loop crash ──────────────────────────────────────────────────────────────────────── Env vars: MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY) — required MARCUS_GEMINI_MODEL (optional) — model id MARCUS_GEMINI_VOICE (optional, ignored in TEXT mode) MARCUS_PROJECT_ROOT (optional) — for sys.path This file uses Python 3.10+ syntax — type unions with `|`, etc. — because the gemini_sdk env is 3.10+. DO NOT try to import it from marcus 3.8. """ from __future__ import annotations import asyncio import json import os import signal import sys import threading import time from typing import Any import numpy as np # Make the Marcus project importable so we can reuse Voice/audio_io.py and # Voice/turn_recorder.py (both pure-stdlib + numpy, no Python-version traps). _PROJECT_ROOT = ( os.environ.get("MARCUS_PROJECT_ROOT") or os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) if _PROJECT_ROOT not in sys.path: sys.path.insert(0, _PROJECT_ROOT) from Voice.audio_io import BuiltinMic from Voice.turn_recorder import TurnRecorder try: from Core.config_loader import load_config _VCFG = load_config("Voice") or {} except Exception: _VCFG = {} _STT = _VCFG.get("stt", {}) # ─── stdout / stderr helpers ────────────────────────────────────── _stdout_lock = threading.Lock() def emit(payload: dict) -> None: """Write one JSON line to stdout. Thread-safe + flushed.""" line = json.dumps(payload, ensure_ascii=False, separators=(",", ":")) with _stdout_lock: sys.stdout.write(line + "\n") sys.stdout.flush() def log(level: str, msg: str) -> None: """Send a log line to the parent (parent forwards to logs/voice.log).""" emit({"type": "log", "level": level, "msg": msg}) # ─── stdin watcher (graceful shutdown) ──────────────────────────── _STOP_REQUESTED = threading.Event() _MIC_HOLDER: list = [] # length-≤1 list — holds the active BuiltinMic def _stdin_watcher() -> None: try: for line in sys.stdin: cmd = line.strip().lower() if cmd == "stop": log("info", "stop received from parent — exiting") _STOP_REQUESTED.set() return elif cmd == "flush": # Parent asks us to drop buffered mic audio (e.g. before # TtsMaker plays a reply, so the robot's own voice doesn't # come back as a fake user utterance). if _MIC_HOLDER: try: _MIC_HOLDER[0].flush() except Exception: pass except Exception: return threading.Thread(target=_stdin_watcher, daemon=True, name="stdin-watcher").start() def _install_signal_handlers() -> None: def _handle(_signum, _frame): log("info", "signal received — exiting") _STOP_REQUESTED.set() for sig in (signal.SIGTERM, signal.SIGINT): try: signal.signal(sig, _handle) except Exception: pass # ─── tunables (mirrors Voice/gemini_script.py reads) ────────────── _MODEL = os.environ.get( "MARCUS_GEMINI_MODEL", _STT.get("gemini_model", "gemini-2.5-flash-native-audio-preview-12-2025"), ) _DEFAULT_VOICE = os.environ.get( "MARCUS_GEMINI_VOICE", _STT.get("gemini_voice_name", "Charon"), ) _API_KEY = ( os.environ.get("MARCUS_GEMINI_API_KEY") or os.environ.get("SANAD_GEMINI_API_KEY") or _STT.get("gemini_api_key", "") ) _MIC_GAIN = float(_STT.get("mic_gain", 1.0)) _SESSION_TIMEOUT = float(_STT.get("gemini_session_timeout_sec", 660)) _MAX_RECONNECT_DELAY = float(_STT.get("gemini_max_reconnect_delay_sec", 30)) _MAX_CONSECUTIVE_ERRORS = int(_STT.get("gemini_max_consecutive_errors", 10)) _NO_MESSAGES_TIMEOUT = float(_STT.get("gemini_no_messages_timeout_sec", 30)) SEND_SAMPLE_RATE = int(_STT.get("gemini_send_sample_rate", 16000)) CHUNK_SIZE = int(_STT.get("gemini_chunk_size", 512)) _CHUNK_BYTES = CHUNK_SIZE * 2 _REC_ENABLED = bool(_STT.get("gemini_record_enabled", True)) _RECV_RATE = int(_STT.get("gemini_receive_sample_rate", 24000)) _DATA_DIR = os.path.join( _PROJECT_ROOT, _VCFG.get("audio", {}).get("data_dir", "Data/Voice/Recordings"), "gemini_turns", ) _SYS_PROMPT = _STT.get( "gemini_system_prompt", "Transcribe what the user says to Sanad. Stay silent.", ) _SP_FILE = _STT.get("gemini_system_prompt_file", "") if _SP_FILE: _sp_path = ( _SP_FILE if os.path.isabs(_SP_FILE) else os.path.join(_PROJECT_ROOT, _SP_FILE) ) try: with open(_sp_path, "r", encoding="utf-8") as f: txt = f.read().strip() if txt: _SYS_PROMPT = txt except Exception: pass # ─── main async loop ────────────────────────────────────────────── def _build_config(types): vad_start = _STT.get("gemini_vad_start_sensitivity", "START_SENSITIVITY_HIGH") vad_end = _STT.get("gemini_vad_end_sensitivity", "END_SENSITIVITY_LOW") prefix_ms = int(_STT.get("gemini_vad_prefix_padding_ms", 20)) silence_ms = int(_STT.get("gemini_vad_silence_duration_ms", 200)) return types.LiveConnectConfig( response_modalities=["TEXT"], realtime_input_config=types.RealtimeInputConfig( automatic_activity_detection=types.AutomaticActivityDetection( disabled=False, start_of_speech_sensitivity=getattr(types.StartSensitivity, vad_start), end_of_speech_sensitivity=getattr(types.EndSensitivity, vad_end), prefix_padding_ms=prefix_ms, silence_duration_ms=silence_ms, ), ), input_audio_transcription=types.AudioTranscriptionConfig(), system_instruction=types.Content( parts=[types.Part(text=_SYS_PROMPT)], ), ) async def _send_mic_loop(session, types_mod, mic, recorder, done: asyncio.Event) -> None: loop = asyncio.get_event_loop() frame_pause = CHUNK_SIZE / float(SEND_SAMPLE_RATE) last_activity = time.time() while not done.is_set() and not _STOP_REQUESTED.is_set(): try: raw = await loop.run_in_executor(None, mic.read_chunk, _CHUNK_BYTES) except Exception as e: log("warn", f"mic read failed: {e}") break if not raw: await asyncio.sleep(frame_pause) continue if _MIC_GAIN != 1.0: samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16) raw = samples.tobytes() # Per-turn user-audio capture for the WAV recorder. We don't have # Gemini's "is the AI speaking" flag (no audio out), so capture # whenever we have meaningful energy. try: samples_view = np.frombuffer(raw, dtype=np.int16) if samples_view.size and int(np.abs(samples_view).max()) > 250: recorder.capture_user(raw) except Exception: pass now = time.time() if now - last_activity > 10: log("info", f"alive (idle {now - last_activity:.0f}s)") last_activity = now try: await session.send_realtime_input( audio=types_mod.Blob( data=raw, mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}", ), ) except asyncio.CancelledError: return except Exception as e: log("warn", f"mic send failed: {e}") done.set() return await asyncio.sleep(frame_pause) async def _receive_loop(session, recorder, done: asyncio.Event) -> None: last_recv = time.time() try: while not done.is_set() and not _STOP_REQUESTED.is_set(): async for response in session.receive(): last_recv = time.time() if done.is_set(): break if (hasattr(response, "go_away") and getattr(response, "go_away", None) is not None): emit({"type": "reconnect", "reason": "server go_away"}) done.set() return sc = getattr(response, "server_content", None) if sc is None: continue it = getattr(sc, "input_transcription", None) if it is not None: text = (getattr(it, "text", "") or "").strip() if text: emit({"type": "user", "text": text}) try: recorder.add_user_text(text) except Exception: pass mt = getattr(sc, "model_turn", None) if mt is not None: for part in getattr(mt, "parts", []) or []: txt = getattr(part, "text", None) if txt: txt = txt.strip() if txt: emit({"type": "bot", "text": txt}) try: recorder.add_robot_text(txt) except Exception: pass if getattr(sc, "turn_complete", False): try: recorder.finish_turn() except Exception: pass emit({"type": "turn_end"}) if time.time() - last_recv > _NO_MESSAGES_TIMEOUT: log("warn", f"no messages from Gemini for {_NO_MESSAGES_TIMEOUT:.0f}s") break await asyncio.sleep(0.1) except asyncio.CancelledError: return except Exception as e: log("warn", f"receive ended: {e}") finally: done.set() async def main_async() -> int: if not _API_KEY: log("error", "no Gemini API key (set MARCUS_GEMINI_API_KEY)") return 3 try: from google import genai from google.genai import types except Exception as e: log("error", f"google-genai not importable: {e}") return 2 try: client = genai.Client(api_key=_API_KEY) except Exception as e: log("error", f"failed to create Gemini client: {e}") return 4 config = _build_config(types) mic = BuiltinMic() mic.start() _MIC_HOLDER.append(mic) # expose to the stdin "flush" watcher recorder = TurnRecorder( enabled=_REC_ENABLED, out_dir=_DATA_DIR, user_rate=SEND_SAMPLE_RATE, robot_rate=_RECV_RATE, ) session_num = 0 consecutive_errors = 0 start = time.time() rc = 0 try: while not _STOP_REQUESTED.is_set(): session_num += 1 uptime_min = (time.time() - start) / 60 try: log("info", f"connecting (session #{session_num}, uptime {uptime_min:.0f}m)") async with client.aio.live.connect(model=_MODEL, config=config) as session: emit({"type": "ready"}) consecutive_errors = 0 mic.flush() done = asyncio.Event() try: await asyncio.wait_for( asyncio.gather( _send_mic_loop(session, types, mic, recorder, done), _receive_loop(session, recorder, done), ), timeout=_SESSION_TIMEOUT, ) except asyncio.TimeoutError: log("info", f"session timed out after {_SESSION_TIMEOUT:.0f}s") except asyncio.CancelledError: pass log("info", f"session #{session_num} ended — reconnecting in 1s") try: mic.flush() except Exception: pass if _STOP_REQUESTED.is_set(): break await asyncio.sleep(1) except asyncio.CancelledError: break except Exception as e: consecutive_errors += 1 delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors) log("error", f"session error #{consecutive_errors}: {e} — retry in {delay:.0f}s") try: await asyncio.sleep(delay) except asyncio.CancelledError: break if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS: log("warn", f"{consecutive_errors} consecutive errors — recreating client") try: client = genai.Client(api_key=_API_KEY) consecutive_errors = 0 except Exception as ce: log("error", f"client recreation failed: {ce}") finally: try: mic.stop() except Exception: pass return rc def main() -> int: _install_signal_handlers() try: return asyncio.run(main_async()) except KeyboardInterrupt: return 0 except Exception as e: log("error", f"fatal: {e}") return 4 if __name__ == "__main__": sys.exit(main())