Marcus/Voice/gemini_runner.py

441 lines
15 KiB
Python

#!/usr/bin/env python3
"""Voice/gemini_runner.py — Gemini Live STT subprocess.
Runs in a Python 3.10+ env (the `gemini_sdk` conda env on this Jetson) so it
can import `google-genai`, which doesn't support Python 3.8. The marcus env
itself is pinned to Python 3.8 by the NVIDIA Jetson torch wheel, so Gemini
has to live in its own process — the same pattern Sanad uses.
The marcus parent process spawns this script via:
/path/to/gemini_sdk/python -u Voice/gemini_runner.py
and parses the JSON-lines stream we emit on stdout. The parent never sees
audio bytes — this script owns the mic, the Gemini WebSocket, AND the WAV
recording, so the IPC boundary stays narrow (just transcripts).
────────────────────────────────────────────────────────────────────────
Stdout protocol (one JSON object per line, UTF-8):
{"type":"ready"} session connected, mic is live
{"type":"user", "text":"..."} user input transcription
{"type":"bot", "text":"..."} Gemini's text reply (logged only — never spoken)
{"type":"turn_end"} Gemini emitted turn_complete
{"type":"reconnect", "reason":"..."} session ended, will reconnect
{"type":"log", "level":"info|warn|error", "msg":"..."}
Stdin protocol (line-based, case-insensitive):
"stop\n" request graceful shutdown
"flush\n" drop buffered mic audio (sent by the parent before it plays a reply)
Exit codes:
0 — clean shutdown after "stop" or signal
2 — google-genai not importable
3 — no API key
4 — Gemini client creation failed, or fatal session loop crash
────────────────────────────────────────────────────────────────────────
Env vars:
MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY) — required
MARCUS_GEMINI_MODEL (optional) — model id
MARCUS_GEMINI_VOICE (optional, ignored in TEXT mode)
MARCUS_PROJECT_ROOT (optional) — for sys.path
This file uses Python 3.10+ syntax — type unions with `|`, etc. — because
the gemini_sdk env is 3.10+. DO NOT try to import it from marcus 3.8.
"""
from __future__ import annotations
import asyncio
import json
import os
import signal
import sys
import threading
import time
from typing import Any
import numpy as np
# Make the Marcus project importable so we can reuse Voice/audio_io.py and
# Voice/turn_recorder.py (both pure-stdlib + numpy, no Python-version traps).
# MARCUS_PROJECT_ROOT wins when set; otherwise use the directory two levels
# above this file (the parent of the directory that contains it).
_PROJECT_ROOT = (
    os.environ.get("MARCUS_PROJECT_ROOT")
    or os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

from Voice.audio_io import BuiltinMic
from Voice.turn_recorder import TurnRecorder

# The Voice config is optional: if the loader cannot be imported or raises,
# run on the built-in defaults. `log` is not defined yet at this point in the
# module, so the failure cannot be reported through the stdout protocol here.
try:
    from Core.config_loader import load_config
    _VCFG = load_config("Voice") or {}
except Exception:
    _VCFG = {}

# `or {}` instead of a .get() default: a config that carries an explicit null
# for "stt" would otherwise leave _STT as None and break every later
# _STT.get(...) lookup.
_STT = _VCFG.get("stt") or {}
# ─── stdout / stderr helpers ──────────────────────────────────────
# Re-entrant on purpose. The SIGTERM/SIGINT handler in this file logs through
# emit(), and Python runs signal handlers on the main thread; with a plain
# Lock, a signal arriving while the main thread is inside emit() would block
# forever on a lock the same thread already holds.
_stdout_lock = threading.RLock()


def emit(payload: dict) -> None:
    """Write one JSON line to stdout. Thread-safe + flushed.

    The payload is serialised compactly (no spaces after separators) with
    non-ASCII characters kept as-is, then written together with its trailing
    newline in a single write() call and flushed while the lock is held, so
    lines from different threads never interleave.

    Args:
        payload: JSON-serialisable dict forming one protocol message.

    Raises:
        TypeError / ValueError: if the payload cannot be serialised.
        OSError: if stdout cannot be written (e.g. the parent closed the pipe).
    """
    line = json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
    with _stdout_lock:
        sys.stdout.write(line + "\n")
        sys.stdout.flush()


def log(level: str, msg: str) -> None:
    """Send a log line to the parent (parent forwards to logs/voice.log).

    Args:
        level: Severity label; the protocol documents "info", "warn", "error".
        msg: Human-readable message text.
    """
    emit({"type": "log", "level": level, "msg": msg})
# ─── stdin watcher (graceful shutdown) ────────────────────────────
_STOP_REQUESTED = threading.Event()
_MIC_HOLDER: list = []  # length-≤1 list — holds the active BuiltinMic


def _stdin_watcher() -> None:
    """Read parent commands from stdin until "stop" arrives or stdin ends.

    Commands are matched after stripping whitespace and lower-casing:
      stop  — set _STOP_REQUESTED, log it, and end the watcher.
      flush — call flush() on the mic registered in _MIC_HOLDER, if any.
    Any other line is ignored. Any error while reading stdin ends the
    watcher quietly.
    """
    try:
        for line in sys.stdin:
            cmd = line.strip().lower()
            if cmd == "stop":
                # Raise the flag BEFORE logging: if the log write fails (for
                # example the parent already closed our stdout) the exception
                # is swallowed below, and the stop request must not be lost
                # with it.
                _STOP_REQUESTED.set()
                log("info", "stop received from parent — exiting")
                return
            if cmd == "flush" and _MIC_HOLDER:
                # Parent asks us to drop buffered mic audio (e.g. before
                # TtsMaker plays a reply, so the robot's own voice doesn't
                # come back as a fake user utterance).
                try:
                    _MIC_HOLDER[0].flush()
                except Exception as e:
                    # Best-effort, but report it: a failed flush means the
                    # buffered audio was NOT dropped.
                    log("warn", f"mic flush failed: {e}")
    except Exception:
        return


# Started at import time as a daemon thread, so it never blocks process exit.
threading.Thread(target=_stdin_watcher, daemon=True, name="stdin-watcher").start()
def _install_signal_handlers() -> None:
    """Turn SIGTERM and SIGINT into a graceful-stop request.

    The handler sets _STOP_REQUESTED and then reports the signal to the
    parent. Installation is best-effort per signal: a signal that cannot be
    hooked is skipped.
    """
    def _handle(_signum, _frame):
        # Flag first, log second. The handler runs in the middle of whatever
        # the main thread was doing; if the log write raises (closed pipe,
        # re-entrant stdout write) the stop request must already be recorded,
        # and the error must not escape into the interrupted code.
        _STOP_REQUESTED.set()
        try:
            log("info", "signal received — exiting")
        except Exception:
            pass

    for sig in (signal.SIGTERM, signal.SIGINT):
        try:
            signal.signal(sig, _handle)
        except (ValueError, OSError):
            # ValueError: not called from the main thread.
            # OSError: the platform rejected this signal.
            pass
# ─── tunables (mirrors Voice/gemini_script.py reads) ──────────────
# Model / voice / key precedence: environment variable, then the "stt" section
# of the Voice config, then the built-in default. `or` chaining is used (as
# for the API key) so a variable that is set but empty falls through instead
# of overriding the default with "".
_MODEL = (
    os.environ.get("MARCUS_GEMINI_MODEL")
    or _STT.get("gemini_model")
    or "gemini-2.5-flash-native-audio-preview-12-2025"
)
_DEFAULT_VOICE = (
    os.environ.get("MARCUS_GEMINI_VOICE")
    or _STT.get("gemini_voice_name")
    or "Charon"
)
_API_KEY = (
    os.environ.get("MARCUS_GEMINI_API_KEY")
    or os.environ.get("SANAD_GEMINI_API_KEY")
    or _STT.get("gemini_api_key", "")
)

# Multiplier applied to int16 mic samples before sending; 1.0 disables it.
_MIC_GAIN = float(_STT.get("mic_gain", 1.0))
# Upper bound on the lifetime of a single live session, in seconds.
_SESSION_TIMEOUT = float(_STT.get("gemini_session_timeout_sec", 660))
# Cap for the exponential reconnect backoff, in seconds.
_MAX_RECONNECT_DELAY = float(_STT.get("gemini_max_reconnect_delay_sec", 30))
# Consecutive session errors after which the client object is recreated.
_MAX_CONSECUTIVE_ERRORS = int(_STT.get("gemini_max_consecutive_errors", 10))
# Silence threshold used by the receive loop, in seconds.
_NO_MESSAGES_TIMEOUT = float(_STT.get("gemini_no_messages_timeout_sec", 30))

SEND_SAMPLE_RATE = int(_STT.get("gemini_send_sample_rate", 16000))
CHUNK_SIZE = int(_STT.get("gemini_chunk_size", 512))
# Samples are handled as int16 in this file, i.e. two bytes each.
_CHUNK_BYTES = CHUNK_SIZE * 2

_REC_ENABLED = bool(_STT.get("gemini_record_enabled", True))
# Handed to TurnRecorder as the robot-side sample rate.
_RECV_RATE = int(_STT.get("gemini_receive_sample_rate", 24000))
_DATA_DIR = os.path.join(
    _PROJECT_ROOT,
    _VCFG.get("audio", {}).get("data_dir", "Data/Voice/Recordings"),
    "gemini_turns",
)

_SYS_PROMPT = _STT.get(
    "gemini_system_prompt",
    "Transcribe what the user says to Sanad. Stay silent.",
)
# An optional prompt file overrides the inline prompt when it can be read and
# is non-empty. Relative paths are resolved against the project root.
_SP_FILE = _STT.get("gemini_system_prompt_file", "")
if _SP_FILE:
    _sp_path = (
        _SP_FILE if os.path.isabs(_SP_FILE)
        else os.path.join(_PROJECT_ROOT, _SP_FILE)
    )
    try:
        with open(_sp_path, "r", encoding="utf-8") as f:
            txt = f.read().strip()
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: undecodable bytes or
        # an invalid path. Keep running on the inline prompt, but say so —
        # otherwise a misconfigured path silently changes the prompt in use.
        log("warn", f"system prompt file {_sp_path!r} unreadable ({e}) — using configured prompt")
    else:
        if txt:
            _SYS_PROMPT = txt
# ─── main async loop ──────────────────────────────────────────────
def _build_config(types):
    """Assemble the LiveConnectConfig used for every live session.

    The session is text-only on the response side, has input-audio
    transcription enabled, uses the server's automatic activity detection
    with sensitivities and timings read from the "stt" config section, and
    carries _SYS_PROMPT as its system instruction.

    Args:
        types: The `google.genai.types` module (passed in because it is only
            importable after the lazy import in main_async).

    Returns:
        A `types.LiveConnectConfig` instance.

    Raises:
        AttributeError: if a configured sensitivity name is not a member of
            the corresponding `types` enum.
        ValueError / TypeError: if a configured millisecond value is not
            convertible to int.
    """
    start_name = _STT.get("gemini_vad_start_sensitivity", "START_SENSITIVITY_HIGH")
    end_name = _STT.get("gemini_vad_end_sensitivity", "END_SENSITIVITY_LOW")
    prefix_padding = int(_STT.get("gemini_vad_prefix_padding_ms", 20))
    silence_duration = int(_STT.get("gemini_vad_silence_duration_ms", 200))

    # Sensitivities are configured by enum member NAME and resolved here.
    activity_detection = types.AutomaticActivityDetection(
        disabled=False,
        start_of_speech_sensitivity=getattr(types.StartSensitivity, start_name),
        end_of_speech_sensitivity=getattr(types.EndSensitivity, end_name),
        prefix_padding_ms=prefix_padding,
        silence_duration_ms=silence_duration,
    )
    realtime_input = types.RealtimeInputConfig(
        automatic_activity_detection=activity_detection,
    )
    transcription = types.AudioTranscriptionConfig()
    instruction = types.Content(parts=[types.Part(text=_SYS_PROMPT)])

    return types.LiveConnectConfig(
        response_modalities=["TEXT"],
        realtime_input_config=realtime_input,
        input_audio_transcription=transcription,
        system_instruction=instruction,
    )
async def _send_mic_loop(session, types_mod, mic, recorder, done: asyncio.Event) -> None:
    """Stream microphone audio to the live session until told to stop.

    Each pass reads one chunk from the mic in the default executor, applies
    _MIC_GAIN if it is not 1.0, offers the chunk to the recorder when it
    carries meaningful energy, and sends it as realtime PCM input.

    The loop ends when `done` is set, when a stop has been requested, or when
    a mic read or a send fails. `done` is always set on the way out so the
    receive loop is told that no more audio is coming.

    Args:
        session: Live session exposing `send_realtime_input`.
        types_mod: The `google.genai.types` module (for `Blob`).
        mic: Object exposing `read_chunk(n_bytes)`.
        recorder: Object exposing `capture_user(raw_bytes)`.
        done: Event shared with the receive loop; set by either side to end
            the session.
    """
    # get_running_loop() is the supported way to obtain the loop from inside
    # a coroutine.
    loop = asyncio.get_running_loop()
    frame_pause = CHUNK_SIZE / float(SEND_SAMPLE_RATE)
    mime_type = f"audio/pcm;rate={SEND_SAMPLE_RATE}"
    # Timestamp of the last "alive" log line; it is only advanced when that
    # line is written, so the line appears roughly every 10 seconds.
    last_activity = time.time()
    try:
        while not done.is_set() and not _STOP_REQUESTED.is_set():
            try:
                raw = await loop.run_in_executor(None, mic.read_chunk, _CHUNK_BYTES)
            except Exception as e:
                log("warn", f"mic read failed: {e}")
                break
            if not raw:
                # Nothing buffered yet; wait one frame and retry.
                await asyncio.sleep(frame_pause)
                continue

            if _MIC_GAIN != 1.0:
                # Scale in float32, clamp to the int16 range, convert back.
                samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
                samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
                raw = samples.tobytes()

            # Per-turn user-audio capture for the WAV recorder. We don't have
            # Gemini's "is the AI speaking" flag (no audio out), so capture
            # whenever we have meaningful energy.
            try:
                samples_view = np.frombuffer(raw, dtype=np.int16)
                if samples_view.size and int(np.abs(samples_view).max()) > 250:
                    recorder.capture_user(raw)
            except Exception:
                # Recording is best-effort and must never interrupt streaming.
                pass

            now = time.time()
            if now - last_activity > 10:
                log("info", f"alive (idle {now - last_activity:.0f}s)")
                last_activity = now

            try:
                await session.send_realtime_input(
                    audio=types_mod.Blob(data=raw, mime_type=mime_type),
                )
            except asyncio.CancelledError:
                return
            except Exception as e:
                log("warn", f"mic send failed: {e}")
                return
            await asyncio.sleep(frame_pause)
    finally:
        # Every exit path (stop request, mic failure, send failure,
        # cancellation) signals the peer loop, matching _receive_loop.
        done.set()
def _dispatch_server_content(content, recorder) -> None:
    """Emit the protocol messages for one `server_content` payload.

    In order: the user's input transcription (if non-blank), each non-blank
    text part of the model turn, and finally a turn_end marker when the
    payload is flagged turn_complete. Recorder calls are best-effort; their
    exceptions are swallowed.
    """
    transcription = getattr(content, "input_transcription", None)
    if transcription is not None:
        heard = (getattr(transcription, "text", "") or "").strip()
        if heard:
            emit({"type": "user", "text": heard})
            try:
                recorder.add_user_text(heard)
            except Exception:
                pass

    model_turn = getattr(content, "model_turn", None)
    if model_turn is not None:
        for part in getattr(model_turn, "parts", []) or []:
            reply = getattr(part, "text", None)
            if not reply:
                continue
            reply = reply.strip()
            if not reply:
                continue
            emit({"type": "bot", "text": reply})
            try:
                recorder.add_robot_text(reply)
            except Exception:
                pass

    if getattr(content, "turn_complete", False):
        try:
            recorder.finish_turn()
        except Exception:
            pass
        emit({"type": "turn_end"})


async def _receive_loop(session, recorder, done: asyncio.Event) -> None:
    """Consume server messages and translate them into stdout protocol lines.

    Repeatedly iterates `session.receive()` until `done` is set or a stop is
    requested. A message carrying `go_away` emits a reconnect notice and ends
    the loop. `done` is always set on the way out.

    The silence check runs only between passes over `session.receive()`; it
    does not fire while the iterator is still waiting for a message.

    Args:
        session: Live session exposing the async iterator `receive()`.
        recorder: Object exposing `add_user_text`, `add_robot_text` and
            `finish_turn`.
        done: Event shared with the send loop; set by either side to end the
            session.
    """
    last_recv = time.time()
    try:
        while not (done.is_set() or _STOP_REQUESTED.is_set()):
            async for response in session.receive():
                last_recv = time.time()
                if done.is_set():
                    break
                if getattr(response, "go_away", None) is not None:
                    # The server announced it is about to drop the session.
                    emit({"type": "reconnect", "reason": "server go_away"})
                    done.set()
                    return
                content = getattr(response, "server_content", None)
                if content is not None:
                    _dispatch_server_content(content, recorder)

            quiet_for = time.time() - last_recv
            if quiet_for > _NO_MESSAGES_TIMEOUT:
                log("warn", f"no messages from Gemini for {_NO_MESSAGES_TIMEOUT:.0f}s")
                break
            # Brief pause before asking for the next batch of messages.
            await asyncio.sleep(0.1)
    except asyncio.CancelledError:
        return
    except Exception as e:
        log("warn", f"receive ended: {e}")
    finally:
        done.set()
async def _sleep_unless_stopped(seconds: float, step: float = 0.2) -> None:
    """Sleep up to *seconds*, returning early once a stop has been requested."""
    deadline = time.monotonic() + seconds
    while not _STOP_REQUESTED.is_set():
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return
        await asyncio.sleep(min(step, remaining))


async def _run_session(session, types_mod, mic, recorder) -> None:
    """Pump one live session until either loop finishes or the session times out.

    The send and receive loops run as two tasks. As soon as ONE of them ends
    (or _SESSION_TIMEOUT elapses) the other is cancelled. Waiting for both
    would leave the receive loop parked on the network long after the send
    loop had stopped feeding audio, e.g. after a stop request.

    Raises:
        Exception: the first non-cancellation error raised by either loop.
    """
    done = asyncio.Event()
    tasks = [
        asyncio.ensure_future(_send_mic_loop(session, types_mod, mic, recorder, done)),
        asyncio.ensure_future(_receive_loop(session, recorder, done)),
    ]
    try:
        finished, _pending = await asyncio.wait(
            tasks,
            timeout=_SESSION_TIMEOUT,
            return_when=asyncio.FIRST_COMPLETED,
        )
        if not finished:
            log("info", f"session timed out after {_SESSION_TIMEOUT:.0f}s")
    finally:
        done.set()
        for task in tasks:
            task.cancel()  # no-op for a task that has already finished
        # Collect both tasks so neither is left pending and no exception goes
        # unretrieved.
        outcomes = await asyncio.gather(*tasks, return_exceptions=True)
    for outcome in outcomes:
        # CancelledError is not an Exception subclass, so cancelled tasks are
        # skipped here; real failures are surfaced to the caller's backoff.
        if isinstance(outcome, Exception):
            raise outcome


async def main_async() -> int:
    """Run the connect → stream → reconnect loop until a stop is requested.

    Sessions are opened one after another. A session that ends normally is
    followed by a 1 s pause; a session that raises is followed by an
    exponential backoff capped at _MAX_RECONNECT_DELAY, and after
    _MAX_CONSECUTIVE_ERRORS failures in a row the client is recreated. Both
    pauses end early when a stop is requested.

    Returns:
        0 after a clean shutdown, 2 if google-genai cannot be imported,
        3 if no API key is configured, 4 if the client cannot be created.
    """
    if not _API_KEY:
        log("error", "no Gemini API key (set MARCUS_GEMINI_API_KEY)")
        return 3
    try:
        from google import genai
        from google.genai import types
    except Exception as e:
        log("error", f"google-genai not importable: {e}")
        return 2
    try:
        client = genai.Client(api_key=_API_KEY)
    except Exception as e:
        log("error", f"failed to create Gemini client: {e}")
        return 4

    config = _build_config(types)
    mic = BuiltinMic()
    session_num = 0
    consecutive_errors = 0
    start = time.time()
    try:
        # Everything from mic.start() onward sits inside the try so the mic
        # is released even if recorder construction fails.
        mic.start()
        _MIC_HOLDER.append(mic)  # expose to the stdin "flush" watcher
        recorder = TurnRecorder(
            enabled=_REC_ENABLED,
            out_dir=_DATA_DIR,
            user_rate=SEND_SAMPLE_RATE,
            robot_rate=_RECV_RATE,
        )
        while not _STOP_REQUESTED.is_set():
            session_num += 1
            uptime_min = (time.time() - start) / 60
            try:
                log("info", f"connecting (session #{session_num}, uptime {uptime_min:.0f}m)")
                async with client.aio.live.connect(model=_MODEL, config=config) as session:
                    emit({"type": "ready"})
                    consecutive_errors = 0
                    # Drop audio buffered while we were not connected.
                    mic.flush()
                    await _run_session(session, types, mic, recorder)
                log("info", f"session #{session_num} ended — reconnecting in 1s")
                try:
                    mic.flush()
                except Exception:
                    pass
                if _STOP_REQUESTED.is_set():
                    break
                await _sleep_unless_stopped(1)
            except asyncio.CancelledError:
                break
            except Exception as e:
                consecutive_errors += 1
                delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
                log("error", f"session error #{consecutive_errors}: {e} — retry in {delay:.0f}s")
                try:
                    await _sleep_unless_stopped(delay)
                except asyncio.CancelledError:
                    break
                if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
                    log("warn", f"{consecutive_errors} consecutive errors — recreating client")
                    try:
                        client = genai.Client(api_key=_API_KEY)
                        consecutive_errors = 0
                    except Exception as ce:
                        log("error", f"client recreation failed: {ce}")
    finally:
        # Unregister first so a late "flush" command cannot reach a stopped mic.
        _MIC_HOLDER.clear()
        try:
            mic.stop()
        except Exception:
            pass
    return 0
def main() -> int:
    """Install signal handlers, run the async main loop, return an exit code.

    Returns:
        Whatever main_async() returns; 0 if a KeyboardInterrupt escapes the
        event loop; 4 (after logging the error) for any other exception.
    """
    _install_signal_handlers()
    exit_code = 4  # value used when the event loop dies with an exception
    try:
        exit_code = asyncio.run(main_async())
    except KeyboardInterrupt:
        exit_code = 0
    except Exception as e:
        log("error", f"fatal: {e}")
    return exit_code
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())