Marcus/Voice/gemini_runner.py

#!/usr/bin/env python3
"""Voice/gemini_runner.py — Gemini Live STT subprocess.

Runs in a Python 3.10+ env (the `gemini_sdk` conda env on this Jetson) so it
can import `google-genai`, which doesn't support Python 3.8. The marcus env
itself is pinned to Python 3.8 by the NVIDIA Jetson torch wheel, so Gemini
has to live in its own process — the same pattern Sanad uses.

The marcus parent process spawns this script via:

    /path/to/gemini_sdk/python -u Voice/gemini_runner.py

and parses the JSON-lines stream we emit on stdout. The parent never sees
audio bytes — this script owns the mic, the Gemini WebSocket, AND the WAV
recording, so the IPC boundary stays narrow (just transcripts).

────────────────────────────────────────────────────────────────────────
Stdout protocol (one JSON object per line, UTF-8):
    {"type":"ready"}                          session connected, mic is live
    {"type":"user",   "text":"..."}           user input transcription
    {"type":"bot",    "text":"..."}           Gemini's text reply (logged only — never spoken)
    {"type":"turn_end"}                       Gemini emitted turn_complete
    {"type":"reconnect", "reason":"..."}      session ended, will reconnect
    {"type":"log", "level":"info|warn|error", "msg":"..."}

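Stdin protocol (line-based, case-insensitive; unknown lines are ignored):
    "stop\n"                                   request graceful shutdown
    "flush\n"                                  drop buffered mic audio (e.g. before TTS playback)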

Exit codes:
    0 — clean shutdown after "stop" or signal
    2 — google-genai not importable
    3 — no API key
    4 — Gemini client could not be created, or fatal session loop crash
────────────────────────────────────────────────────────────────────────

Env vars:
    MARCUS_GEMINI_API_KEY    (or SANAD_GEMINI_API_KEY)   — required unless the Voice config sets stt.gemini_api_key
    MARCUS_GEMINI_MODEL      (optional)                   — model id
    MARCUS_GEMINI_VOICE      (optional, ignored in TEXT mode)
    MARCUS_PROJECT_ROOT      (optional)                   — for sys.path

This file uses Python 3.10+ syntax — type unions with `|`, etc. — because
the gemini_sdk env is 3.10+. DO NOT try to import it from marcus 3.8.
"""

from __future__ import annotations

import asyncio
import json
import os
import signal
import sys
import threading
import time
from typing import Any

import numpy as np

# Make the Marcus project importable so we can reuse Voice/audio_io.py and
# Voice/turn_recorder.py (both pure-stdlib + numpy, no Python-version traps).
# MARCUS_PROJECT_ROOT wins when it is set and non-empty; otherwise the root is
# the parent of the directory that contains this file.
_PROJECT_ROOT = (
    os.environ.get("MARCUS_PROJECT_ROOT")
    or os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
# Put the root at the front of sys.path (once) so the `Voice.*` / `Core.*`
# imports below can be resolved from the project tree.
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)

from Voice.audio_io import BuiltinMic
from Voice.turn_recorder import TurnRecorder

# The Voice config is optional: if Core.config_loader cannot be imported or
# load_config() raises, fall back to an empty config so every tunable that
# reads from it resolves to its built-in default.
try:
    from Core.config_loader import load_config
    _VCFG = load_config("Voice") or {}
except Exception:
    _VCFG = {}

# `or {}` instead of a `.get` default: a config that carries an explicit but
# empty `stt` entry may yield None here, and None would break every later
# `_STT.get(...)` lookup.
_STT = _VCFG.get("stt") or {}


# ─── stdout / stderr helpers ──────────────────────────────────────

# Re-entrant on purpose. The SIGTERM/SIGINT handler in this file logs through
# emit() and Python runs signal handlers on the main thread — possibly while
# that same thread is already inside emit(). With a plain Lock that nested
# call would wait forever on a lock its own thread holds.
_stdout_lock = threading.RLock()


def emit(payload: dict) -> None:
    """Write one JSON line to stdout. Thread-safe + flushed.

    The payload is serialised compactly (no spaces after separators) with
    non-ASCII characters kept as-is, followed by a single newline. The write
    and the flush happen under ``_stdout_lock`` so lines produced by
    different threads never interleave.

    Args:
        payload: JSON-serialisable mapping; one protocol message.

    Raises:
        TypeError / ValueError: if ``payload`` cannot be serialised.
        OSError: if stdout cannot be written (e.g. the parent closed the pipe).
    """
    line = json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
    with _stdout_lock:
        sys.stdout.write(line + "\n")
        sys.stdout.flush()


def log(level: str, msg: str) -> None:
    """Emit a ``{"type":"log"}`` record for the parent process.

    The parent is the one that persists these (it forwards them to
    logs/voice.log); this function only puts the record on stdout.

    Args:
        level: severity label — the protocol uses "info", "warn" or "error".
        msg: human-readable message text.
    """
    record = {"type": "log", "level": level, "msg": msg}
    emit(record)


# ─── stdin watcher (graceful shutdown) ────────────────────────────


_STOP_REQUESTED = threading.Event()
_MIC_HOLDER: list = []          # length-≤1 list — holds the active BuiltinMic


def _stdin_watcher() -> None:
    """Serve line commands from the parent on stdin until "stop" or EOF.

    Runs on a daemon thread. Commands are matched case-insensitively after
    stripping surrounding whitespace:

    * ``stop``  — set ``_STOP_REQUESTED`` and end the watcher.
    * ``flush`` — call ``flush()`` on the mic registered in ``_MIC_HOLDER``
      (no-op when none is registered).

    Any other line is ignored. The watcher never raises: an error while
    reading stdin (or while logging) simply ends the thread.
    """
    try:
        for line in sys.stdin:
            cmd = line.strip().lower()
            if cmd == "stop":
                # Raise the flag BEFORE logging: if stdout is already closed
                # the log call raises, and the stop request must not be lost
                # along with it.
                _STOP_REQUESTED.set()
                log("info", "stop received from parent — exiting")
                return
            if cmd == "flush":
                # Parent asks us to drop buffered mic audio (e.g. before
                # TtsMaker plays a reply, so the robot's own voice doesn't
                # come back as a fake user utterance).
                # Iterate over a snapshot so a concurrent change to the
                # holder cannot raise mid-loop.
                for mic in list(_MIC_HOLDER):
                    try:
                        mic.flush()
                    except Exception as e:
                        # Best-effort, but make the failure visible: a flush
                        # that silently fails lets stale audio through.
                        log("warn", f"mic flush failed: {e}")
    except Exception:
        return


threading.Thread(target=_stdin_watcher, daemon=True, name="stdin-watcher").start()


def _install_signal_handlers() -> None:
    """Turn SIGTERM and SIGINT into a graceful-shutdown request.

    The installed handler sets ``_STOP_REQUESTED`` and then tries to log the
    event. Installation is best-effort: a signal whose handler cannot be
    installed is skipped.
    """
    def _handle(_signum, _frame):
        # Flag first, log second. The handler runs on the main thread at an
        # arbitrary point; if logging fails (e.g. the parent already closed
        # our stdout) the stop request must still be recorded, and the
        # exception must not be thrown into whatever code was interrupted.
        _STOP_REQUESTED.set()
        try:
            log("info", "signal received — exiting")
        except Exception:
            pass

    for sig in (signal.SIGTERM, signal.SIGINT):
        try:
            signal.signal(sig, _handle)
        except Exception:
            # Deliberately best-effort: keep going with the other signal.
            pass


# ─── tunables (mirrors Voice/gemini_script.py reads) ──────────────

# Model id and voice name: environment variable if present, else the Voice
# config's `stt` section, else the literal default. Note os.environ.get only
# falls back when the variable is absent — a variable set to "" is used as-is.
_MODEL = os.environ.get(
    "MARCUS_GEMINI_MODEL",
    _STT.get("gemini_model", "gemini-2.5-flash-native-audio-preview-12-2025"),
)
# NOTE(review): _DEFAULT_VOICE is not referenced by the code visible in this
# file (the session is configured for TEXT responses).
_DEFAULT_VOICE = os.environ.get(
    "MARCUS_GEMINI_VOICE",
    _STT.get("gemini_voice_name", "Charon"),
)

# First non-empty value wins: MARCUS_ env var, SANAD_ env var, then config.
# An empty result makes main_async() exit with code 3.
_API_KEY = (
    os.environ.get("MARCUS_GEMINI_API_KEY")
    or os.environ.get("SANAD_GEMINI_API_KEY")
    or _STT.get("gemini_api_key", "")
)

# Multiplier applied to the int16 mic samples before sending; 1.0 = untouched.
_MIC_GAIN = float(_STT.get("mic_gain", 1.0))
# Seconds one Live session may run before the main loop drops it and reconnects.
_SESSION_TIMEOUT = float(_STT.get("gemini_session_timeout_sec", 660))
# Upper bound (seconds) for the exponential back-off after a session error.
_MAX_RECONNECT_DELAY = float(_STT.get("gemini_max_reconnect_delay_sec", 30))
# After this many session errors in a row the genai client is recreated.
_MAX_CONSECUTIVE_ERRORS = int(_STT.get("gemini_max_consecutive_errors", 10))
# Seconds without any received message after which the receive loop gives up.
_NO_MESSAGES_TIMEOUT = float(_STT.get("gemini_no_messages_timeout_sec", 30))

# Sample rate (Hz) advertised in the audio MIME type sent to Gemini.
SEND_SAMPLE_RATE = int(_STT.get("gemini_send_sample_rate", 16000))
# Samples per mic chunk; CHUNK_SIZE / SEND_SAMPLE_RATE is the chunk duration.
CHUNK_SIZE = int(_STT.get("gemini_chunk_size", 512))
# Bytes requested from the mic per read: 2 bytes per sample (data is handled as int16).
_CHUNK_BYTES = CHUNK_SIZE * 2

# Recorder settings handed to TurnRecorder (enabled / robot_rate / out_dir).
_REC_ENABLED = bool(_STT.get("gemini_record_enabled", True))
_RECV_RATE = int(_STT.get("gemini_receive_sample_rate", 24000))
# <project root>/<audio.data_dir>/gemini_turns
_DATA_DIR = os.path.join(
    _PROJECT_ROOT,
    _VCFG.get("audio", {}).get("data_dir", "Data/Voice/Recordings"),
    "gemini_turns",
)

# System prompt: the inline `gemini_system_prompt` value (or the built-in
# default) unless `gemini_system_prompt_file` names a readable, non-empty
# file, in which case the stripped file contents replace it.
_SYS_PROMPT = _STT.get(
    "gemini_system_prompt",
    "Transcribe what the user says to Sanad. Stay silent.",
)
_SP_FILE = _STT.get("gemini_system_prompt_file", "")
if _SP_FILE:
    # Relative paths are resolved against the project root.
    _sp_path = (
        _SP_FILE if os.path.isabs(_SP_FILE)
        else os.path.join(_PROJECT_ROOT, _SP_FILE)
    )
    try:
        with open(_sp_path, "r", encoding="utf-8") as f:
            txt = f.read().strip()
    except Exception as _sp_err:
        # Still non-fatal, but say so: silently running with a different
        # prompt than the one configured is very hard to diagnose.
        log("warn", f"system prompt file {_sp_path} not usable: {_sp_err} — keeping inline prompt")
    else:
        if txt:
            _SYS_PROMPT = txt
        else:
            log("warn", f"system prompt file {_sp_path} is empty — keeping inline prompt")


# ─── main async loop ──────────────────────────────────────────────


def _build_config(types):
    """Assemble the ``LiveConnectConfig`` used when opening a Live session.

    The session is configured for TEXT responses, input-audio transcription,
    server-side automatic activity detection (tuned from the ``stt`` config
    section) and the module-level system prompt.

    Args:
        types: the ``google.genai.types`` module (passed in because it is
            imported lazily by the caller).

    Returns:
        A ``types.LiveConnectConfig`` instance.

    Raises:
        AttributeError: if a configured sensitivity name is not a member of
            ``types.StartSensitivity`` / ``types.EndSensitivity``.
        ValueError / TypeError: if a configured padding/silence value is not
            convertible to ``int``.
    """
    # Read every setting up front, in a fixed order, before building anything.
    start_name = _STT.get("gemini_vad_start_sensitivity", "START_SENSITIVITY_HIGH")
    end_name = _STT.get("gemini_vad_end_sensitivity", "END_SENSITIVITY_LOW")
    padding_ms = int(_STT.get("gemini_vad_prefix_padding_ms", 20))
    quiet_ms = int(_STT.get("gemini_vad_silence_duration_ms", 200))

    # Sensitivities are configured by enum member NAME and resolved here.
    activity_detection = types.AutomaticActivityDetection(
        disabled=False,
        start_of_speech_sensitivity=getattr(types.StartSensitivity, start_name),
        end_of_speech_sensitivity=getattr(types.EndSensitivity, end_name),
        prefix_padding_ms=padding_ms,
        silence_duration_ms=quiet_ms,
    )
    realtime_input = types.RealtimeInputConfig(
        automatic_activity_detection=activity_detection,
    )
    transcription = types.AudioTranscriptionConfig()
    instruction = types.Content(parts=[types.Part(text=_SYS_PROMPT)])

    return types.LiveConnectConfig(
        response_modalities=["TEXT"],
        realtime_input_config=realtime_input,
        input_audio_transcription=transcription,
        system_instruction=instruction,
    )


async def _send_mic_loop(session, types_mod, mic, recorder, done: asyncio.Event) -> None:
    """Stream microphone chunks to the Live session until the session ends.

    Each iteration reads ``_CHUNK_BYTES`` from the mic on the default
    executor, applies ``_MIC_GAIN`` if it is not 1.0, offers chunks whose
    peak amplitude exceeds 250 to the recorder, and sends the chunk as PCM
    realtime input. The loop pauses one chunk duration after every send and
    whenever the mic returns no data.

    The loop ends when ``done`` is set, a stop is requested, the mic read
    fails, the send fails, or the task is cancelled. ``done`` is always set
    on the way out so the rest of the session knows the audio feed is gone.

    Args:
        session: connected Live session exposing ``send_realtime_input``.
        types_mod: the ``google.genai.types`` module (for ``Blob``).
        mic: microphone object exposing ``read_chunk(n_bytes)``.
        recorder: turn recorder exposing ``capture_user(raw)``.
        done: per-session event shared with the receive loop.
    """
    loop = asyncio.get_running_loop()
    frame_pause = CHUNK_SIZE / float(SEND_SAMPLE_RATE)
    mime_type = f"audio/pcm;rate={SEND_SAMPLE_RATE}"
    last_activity = time.time()

    try:
        while not done.is_set() and not _STOP_REQUESTED.is_set():
            try:
                raw = await loop.run_in_executor(None, mic.read_chunk, _CHUNK_BYTES)
            except Exception as e:
                log("warn", f"mic read failed: {e}")
                break

            if not raw:
                await asyncio.sleep(frame_pause)
                continue

            if _MIC_GAIN != 1.0:
                # Scale in float32, then clip back into the int16 range.
                samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
                samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
                raw = samples.tobytes()

            # Per-turn user-audio capture for the WAV recorder. We don't have
            # Gemini's "is the AI speaking" flag (no audio out), so capture
            # whenever we have meaningful energy.
            try:
                samples_view = np.frombuffer(raw, dtype=np.int16)
                if samples_view.size:
                    # Peak via max/min as Python ints: np.abs on int16 wraps
                    # -32768 back to -32768, which would hide a fully
                    # negative-clipped chunk.
                    peak = max(int(samples_view.max()), -int(samples_view.min()))
                    if peak > 250:
                        recorder.capture_user(raw)
            except Exception:
                # Recording is best-effort and must never stall the audio
                # feed; logging here would fire once per chunk.
                pass

            # NOTE: last_activity is only reset when this line is logged, so
            # this is effectively a heartbeat emitted roughly every 10 s.
            now = time.time()
            if now - last_activity > 10:
                log("info", f"alive (idle {now - last_activity:.0f}s)")
                last_activity = now

            try:
                await session.send_realtime_input(
                    audio=types_mod.Blob(data=raw, mime_type=mime_type),
                )
            except asyncio.CancelledError:
                return
            except Exception as e:
                log("warn", f"mic send failed: {e}")
                return

            await asyncio.sleep(frame_pause)
    finally:
        # Whatever ended the loop (stop, mic failure, send failure, cancel),
        # the session no longer has an audio feed.
        done.set()


def _forward_server_content(content, recorder) -> None:
    """Translate one ``server_content`` payload into stdout protocol events.

    Emits a "user" event for a non-blank input transcription, a "bot" event
    for every non-blank text part of the model turn, and a "turn_end" event
    when the turn is complete. Recorder calls are best-effort: their
    exceptions are swallowed so they cannot interrupt the event stream.
    """
    transcription = getattr(content, "input_transcription", None)
    if transcription is not None:
        heard = (getattr(transcription, "text", "") or "").strip()
        if heard:
            emit({"type": "user", "text": heard})
            try:
                recorder.add_user_text(heard)
            except Exception:
                pass

    model_turn = getattr(content, "model_turn", None)
    if model_turn is not None:
        for part in getattr(model_turn, "parts", []) or []:
            reply = getattr(part, "text", None)
            if not reply:
                continue
            reply = reply.strip()
            if not reply:
                continue
            emit({"type": "bot", "text": reply})
            try:
                recorder.add_robot_text(reply)
            except Exception:
                pass

    if getattr(content, "turn_complete", False):
        try:
            recorder.finish_turn()
        except Exception:
            pass
        emit({"type": "turn_end"})


async def _receive_loop(session, recorder, done: asyncio.Event) -> None:
    """Consume messages from the Live session and relay them to the parent.

    Repeatedly iterates ``session.receive()`` until ``done`` is set, a stop
    is requested, the server sends ``go_away`` (reported as a "reconnect"
    event), no message has arrived for ``_NO_MESSAGES_TIMEOUT`` seconds, or
    an error occurs. ``done`` is always set when this coroutine exits.

    Args:
        session: connected Live session exposing ``receive()``.
        recorder: turn recorder that receives transcript text / turn ends.
        done: per-session event shared with the mic send loop.
    """
    last_recv = time.time()
    try:
        while not (done.is_set() or _STOP_REQUESTED.is_set()):
            async for response in session.receive():
                last_recv = time.time()
                if done.is_set():
                    break

                if getattr(response, "go_away", None) is not None:
                    emit({"type": "reconnect", "reason": "server go_away"})
                    done.set()
                    return

                content = getattr(response, "server_content", None)
                if content is not None:
                    _forward_server_content(content, recorder)

            # Reached whenever one receive() iteration finishes; bail out if
            # nothing has actually arrived for too long.
            silent_for = time.time() - last_recv
            if silent_for > _NO_MESSAGES_TIMEOUT:
                log("warn", f"no messages from Gemini for {_NO_MESSAGES_TIMEOUT:.0f}s")
                break
            await asyncio.sleep(0.1)
    except asyncio.CancelledError:
        return
    except Exception as e:
        log("warn", f"receive ended: {e}")
    finally:
        done.set()


# How often (seconds) the asyncio side polls the thread-level stop flag.
_STOP_POLL_SEC = 0.1


async def _wait_for_stop() -> None:
    """Return once ``_STOP_REQUESTED`` is set (it is a threading.Event, so poll)."""
    while not _STOP_REQUESTED.is_set():
        await asyncio.sleep(_STOP_POLL_SEC)


async def _sleep_unless_stopped(seconds: float) -> None:
    """Sleep for up to ``seconds``, returning early once a stop is requested."""
    deadline = time.monotonic() + seconds
    while not _STOP_REQUESTED.is_set():
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return
        await asyncio.sleep(min(_STOP_POLL_SEC, remaining))


async def _run_session(session, types, mic, recorder) -> None:
    """Pump one connected session until it ends, times out, or a stop arrives.

    The send loop, the receive loop and a stop watcher run as tasks; as soon
    as any one of them finishes (or ``_SESSION_TIMEOUT`` elapses) the others
    are cancelled. An exception raised by a finished task is re-raised.
    """
    done = asyncio.Event()
    tasks = [
        asyncio.create_task(_send_mic_loop(session, types, mic, recorder, done)),
        asyncio.create_task(_receive_loop(session, recorder, done)),
        asyncio.create_task(_wait_for_stop()),
    ]
    finished = set()
    try:
        finished, _pending = await asyncio.wait(
            tasks,
            timeout=_SESSION_TIMEOUT,
            return_when=asyncio.FIRST_COMPLETED,
        )
    finally:
        # Tear everything down together. The receive loop may be parked
        # inside session.receive(); cancelling is the only way to wake it.
        done.set()
        for task in tasks:
            task.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)

    if not finished:
        log("info", f"session timed out after {_SESSION_TIMEOUT:.0f}s")
    for task in finished:
        if not task.cancelled() and task.exception() is not None:
            raise task.exception()


async def main_async() -> int:
    """Run the connect → stream → reconnect loop until a stop is requested.

    Returns:
        Process exit code: 0 after a clean stop, 2 if google-genai cannot be
        imported, 3 if no API key is configured, 4 if the Gemini client
        cannot be created.

    Raises:
        Exception: anything raised while building the session config or
            starting the mic/recorder propagates to the caller.
    """
    if not _API_KEY:
        log("error", "no Gemini API key (set MARCUS_GEMINI_API_KEY)")
        return 3

    try:
        from google import genai
        from google.genai import types
    except Exception as e:
        log("error", f"google-genai not importable: {e}")
        return 2

    try:
        client = genai.Client(api_key=_API_KEY)
    except Exception as e:
        log("error", f"failed to create Gemini client: {e}")
        return 4

    config = _build_config(types)
    mic = BuiltinMic()

    session_num = 0
    consecutive_errors = 0
    start = time.time()

    try:
        # Started inside the try so the mic is released even if start-up of
        # the mic or the recorder fails.
        mic.start()
        _MIC_HOLDER.append(mic)         # expose to the stdin "flush" watcher

        recorder = TurnRecorder(
            enabled=_REC_ENABLED,
            out_dir=_DATA_DIR,
            user_rate=SEND_SAMPLE_RATE,
            robot_rate=_RECV_RATE,
        )

        while not _STOP_REQUESTED.is_set():
            session_num += 1
            uptime_min = (time.time() - start) / 60
            try:
                log("info", f"connecting (session #{session_num}, uptime {uptime_min:.0f}m)")
                async with client.aio.live.connect(model=_MODEL, config=config) as session:
                    emit({"type": "ready"})
                    consecutive_errors = 0
                    mic.flush()

                    await _run_session(session, types, mic, recorder)

                    # Drop audio buffered while the session was winding down.
                    try:
                        mic.flush()
                    except Exception:
                        pass
                    if _STOP_REQUESTED.is_set():
                        log("info", f"session #{session_num} ended — stop requested")
                        break
                    log("info", f"session #{session_num} ended — reconnecting in 1s")
                    await _sleep_unless_stopped(1)
            except asyncio.CancelledError:
                break
            except Exception as e:
                consecutive_errors += 1
                # Exponential back-off, capped at _MAX_RECONNECT_DELAY.
                delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
                log("error", f"session error #{consecutive_errors}: {e} — retry in {delay:.0f}s")
                try:
                    await _sleep_unless_stopped(delay)
                except asyncio.CancelledError:
                    break
                if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
                    log("warn", f"{consecutive_errors} consecutive errors — recreating client")
                    try:
                        client = genai.Client(api_key=_API_KEY)
                        consecutive_errors = 0
                    except Exception as ce:
                        log("error", f"client recreation failed: {ce}")
    finally:
        # Unregister first so the stdin watcher cannot flush a stopped mic.
        _MIC_HOLDER[:] = [m for m in _MIC_HOLDER if m is not mic]
        try:
            mic.stop()
        except Exception:
            # Best-effort release during shutdown.
            pass

    return 0


def main() -> int:
    """Synchronous entry point: install signal handlers, run the async loop.

    Returns:
        The exit code produced by ``main_async()``; 0 if the run is ended by
        ``KeyboardInterrupt``; 4 (after logging "fatal: …") if any other
        exception escapes the async loop.
    """
    _install_signal_handlers()
    exit_code = 4
    try:
        exit_code = asyncio.run(main_async())
    except KeyboardInterrupt:
        exit_code = 0
    except Exception as e:
        log("error", f"fatal: {e}")
    return exit_code


if __name__ == "__main__":
    sys.exit(main())