# Marcus/Voice/gemini_runner.py

#!/usr/bin/env python3
"""Voice/gemini_runner.py — Gemini Live S2S subprocess (Option 3).
Runs in a Python 3.10+ env (the `gemini_sdk` conda env on this Jetson) so
it can import `google-genai` (which doesn't support Python 3.8). The
marcus env itself is pinned to Python 3.8 by the NVIDIA Jetson torch
wheel, so Gemini has to live in its own process.
This is the full Sanad-pattern speech-to-speech variant:
- response_modalities=["AUDIO"] → Gemini speaks back through G1 speaker
- input_audio_transcription → user transcripts emitted on stdout for
Marcus's wake-word side channel
- output_audio_transcription → Gemini's reply text logged for review
- barge-in detection → user speaking over AI cuts AI off
- echo suppression → mic muted during AI playback
- JPEG camera frames over stdin → Marcus parent forwards frames; runner
streams them to Gemini Live so the
vision answers ("what do you see")
are correct, not hallucinated.
The runner owns the G1 mic AND the G1 speaker (unitree_sdk2py works fine
in the gemini_sdk env on this Jetson — already verified).
────────────────────────────────────────────────────────────────────────
Stdout protocol (one JSON object per line, UTF-8):
{"type":"ready"} session connected
{"type":"user", "text":"..."} user input transcription
{"type":"bot", "text":"..."} Gemini's reply text (logged only — Gemini also speaks it)
{"type":"turn_end"}
{"type":"barge_in"}
{"type":"reconnect", "reason":"..."}
{"type":"log", "level":"info|warn|error", "msg":"..."}
Stdin protocol (line-based):
"stop\n" graceful shutdown
"flush\n" drop mic buffer (echo prevention)
"frame:<base64-jpeg>\n" forward a camera frame to Gemini Live
(Marcus parent throttles to ~2 fps)
────────────────────────────────────────────────────────────────────────
Env vars:
MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY)
MARCUS_GEMINI_MODEL (optional)
MARCUS_GEMINI_VOICE (optional)
MARCUS_PROJECT_ROOT (optional)
"""
from __future__ import annotations
import array
import asyncio
import base64
import json
import os
import signal
import sys
import threading
import time
from typing import Any
import numpy as np
# Resolve the project root so sibling packages (Voice/, Core/) are importable
# when this file is launched directly as a subprocess.
_PROJECT_ROOT = (
    os.environ.get("MARCUS_PROJECT_ROOT")
    or os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
if _PROJECT_ROOT not in sys.path:
    sys.path.insert(0, _PROJECT_ROOT)
from Voice.audio_io import BuiltinMic, BuiltinSpeaker
from Voice.turn_recorder import TurnRecorder
try:
    from Core.config_loader import load_config
    _VCFG = load_config("Voice") or {}
except Exception:
    # Config loading is best-effort; every consumer below has a default.
    _VCFG = {}
# Sub-sections of the Voice config consulted throughout this module.
_STT = _VCFG.get("stt", {})
_SPK = _VCFG.get("speaker", {})
# ─── stdout / stderr helpers ─────────────────────────────────────
# Serializes concurrent writers (asyncio loop + watcher threads) so protocol
# lines never interleave on stdout.
_stdout_lock = threading.Lock()


def emit(payload: dict) -> None:
    """Write one compact JSON object to stdout as a single protocol line."""
    serialized = json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
    with _stdout_lock:
        sys.stdout.write(f"{serialized}\n")
        sys.stdout.flush()


def log(level: str, msg: str) -> None:
    """Send a structured log record to the parent over the stdout protocol."""
    emit({"type": "log", "level": level, "msg": msg})
# ─── stdin watcher (graceful shutdown + flush + frames) ──────────
# Set when the parent sends "stop" or a termination signal arrives; every
# loop in this module polls it cooperatively.
_STOP_REQUESTED = threading.Event()
_MIC_HOLDER: list = []  # [BuiltinMic] when active
# Latest JPEG frame from the parent (raw bytes). The frame-send loop
# picks this up and ships it to Gemini Live.
_LATEST_FRAME_LOCK = threading.Lock()
_LATEST_FRAME: dict = {"bytes": None, "ts": 0.0}
def _stdin_watcher() -> None:
    """Consume parent commands from stdin until EOF or a "stop" command.

    Recognized lines: "stop", "flush", and "frame:<base64-jpeg>" (see the
    module docstring's stdin protocol). Unknown lines are ignored.
    """
    try:
        for raw_line in sys.stdin:
            text = raw_line.rstrip("\n")
            if not text:
                continue
            # Match the cheap commands first, then fall through to frame:
            lowered = text.lower()
            if lowered == "stop":
                log("info", "stop received from parent — exiting")
                _STOP_REQUESTED.set()
                return
            if lowered == "flush":
                if _MIC_HOLDER:
                    try:
                        _MIC_HOLDER[0].flush()
                    except Exception:
                        pass
                continue
            if text.startswith("frame:"):
                encoded = text[len("frame:"):]
                try:
                    jpeg = base64.b64decode(encoded)
                except Exception:
                    continue  # corrupt payload — drop it silently
                if jpeg:
                    with _LATEST_FRAME_LOCK:
                        _LATEST_FRAME["bytes"] = jpeg
                        _LATEST_FRAME["ts"] = time.time()
    except Exception:
        return


threading.Thread(target=_stdin_watcher, daemon=True, name="stdin-watcher").start()
def _install_signal_handlers() -> None:
    """Translate SIGTERM/SIGINT into _STOP_REQUESTED for a clean shutdown."""
    def _on_signal(_signum, _frame):
        log("info", "signal received — exiting")
        _STOP_REQUESTED.set()

    for _sig in (signal.SIGTERM, signal.SIGINT):
        try:
            signal.signal(_sig, _on_signal)
        except Exception:
            # e.g. not running in the main thread — best-effort only.
            pass
# ─── tunables ────────────────────────────────────────────────────
# Model / voice / API key: environment variables win over Voice config.
_MODEL = os.environ.get(
    "MARCUS_GEMINI_MODEL",
    _STT.get("gemini_model", "gemini-2.5-flash-native-audio-preview-12-2025"),
)
_VOICE = os.environ.get(
    "MARCUS_GEMINI_VOICE",
    _STT.get("gemini_voice_name", "Charon"),
)
_API_KEY = (
    os.environ.get("MARCUS_GEMINI_API_KEY")
    or os.environ.get("SANAD_GEMINI_API_KEY")
    or _STT.get("gemini_api_key", "")
)
# Software gain applied to every mic chunk before sending.
_MIC_GAIN = float(_STT.get("mic_gain", 1.0))
# Hard cap on one Live session before reconnecting.
_SESSION_TIMEOUT = float(_STT.get("gemini_session_timeout_sec", 660))
# Reconnect backoff cap and the error count that triggers a client rebuild.
_MAX_RECONNECT_DELAY = float(_STT.get("gemini_max_reconnect_delay_sec", 30))
_MAX_CONSECUTIVE_ERRORS = int(_STT.get("gemini_max_consecutive_errors", 10))
# Watchdog: receive loop bails out if the server goes silent this long.
_NO_MESSAGES_TIMEOUT = float(_STT.get("gemini_no_messages_timeout_sec", 30))
# PCM rates: 16 kHz up to Gemini, 24 kHz back from it (defaults).
SEND_SAMPLE_RATE = int(_STT.get("gemini_send_sample_rate", 16000))
RECEIVE_SAMPLE_RATE = int(_STT.get("gemini_receive_sample_rate", 24000))
CHUNK_SIZE = int(_STT.get("gemini_chunk_size", 512))  # samples per mic read
_CHUNK_BYTES = CHUNK_SIZE * 2  # int16 → 2 bytes per sample
_SILENCE_PCM = b"\x00" * _CHUNK_BYTES  # substituted during echo suppression
# Barge-in: sustained loud mic chunks while the AI speaks cut it off.
_BARGE_THRESHOLD = int(_STT.get("gemini_barge_threshold", 500))
_BARGE_CHUNKS = int(_STT.get("gemini_barge_loud_chunks_needed", 3))
_BARGE_COOLDOWN = float(_STT.get("gemini_barge_cooldown_sec", 0.3))
# Mic frames quieter than this are muted while the AI is speaking.
_ECHO_SUPPRESS_BELOW = int(_STT.get("gemini_echo_suppress_below", 500))
# Ignore barge-in energy for this long after the AI starts talking.
_AI_GRACE = float(_STT.get("gemini_ai_speak_grace_sec", 0.15))
# Camera frame forwarding cadence and staleness cutoff.
_FRAME_INTERVAL = float(_STT.get("gemini_frame_interval_sec", 0.5))
_FRAME_MAX_AGE = float(_STT.get("gemini_frame_max_age_sec", 1.5))
# Per-turn WAV recording of user + robot audio.
_REC_ENABLED = bool(_STT.get("gemini_record_enabled", True))
_DATA_DIR = os.path.join(
    _PROJECT_ROOT,
    _VCFG.get("audio", {}).get("data_dir", "Data/Voice/Recordings"),
    "gemini_turns",
)
# System prompt: env var beats config string; an optional prompt *file*
# (gemini_system_prompt_file) overrides both when present and non-empty.
_SYS_PROMPT = (
    os.environ.get("MARCUS_GEMINI_SYSTEM_PROMPT")
    or _STT.get(
        "gemini_system_prompt",
        "You are Sanad, a humanoid robot assistant. Reply briefly.",
    )
)
_SP_FILE = _STT.get("gemini_system_prompt_file", "")
if _SP_FILE:
    # Relative paths are resolved against the project root.
    _sp_path = (
        _SP_FILE if os.path.isabs(_SP_FILE)
        else os.path.join(_PROJECT_ROOT, _SP_FILE)
    )
    try:
        with open(_sp_path, "r", encoding="utf-8") as f:
            txt = f.read().strip()
            if txt:
                _SYS_PROMPT = txt
    except Exception:
        # Missing/unreadable prompt file — keep the inline default.
        pass
def _audio_energy(pcm: bytes) -> int:
try:
samples = array.array("h", pcm)
return sum(abs(s) for s in samples) // len(samples) if samples else 0
except Exception:
return 0
# ─── G1 speaker (audio playback) — initialized in main_async ────
def _init_g1_speaker() -> BuiltinSpeaker | None:
    """Initialise the G1 DDS audio client and wrap it in a BuiltinSpeaker.

    Sanad's audio_io.BuiltinSpeaker normally takes an already-initialised
    AudioClient. This subprocess owns its own DDS init. Returns None when
    the SDK is unavailable or the audio client cannot be brought up.
    """
    try:
        from unitree_sdk2py.core.channel import ChannelFactoryInitialize
        from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
    except Exception as e:
        log("error", f"unitree_sdk2py not importable in this env: {e}")
        return None

    interface = _SPK.get("dds_interface", "eth0")
    try:
        ChannelFactoryInitialize(0, interface)
    except Exception as e:
        # Already initialised in this process — that's fine.
        log("warn", f"ChannelFactoryInitialize: {e}")

    try:
        audio_client = AudioClient()
        audio_client.SetTimeout(10.0)
        audio_client.Init()
        try:
            audio_client.SetVolume(int(_SPK.get("volume", 100)))
        except Exception:
            pass  # volume is cosmetic — playback still works without it
    except Exception as e:
        log("error", f"AudioClient init failed: {e}")
        return None

    return BuiltinSpeaker(audio_client, app_name=_SPK.get("app_name", "sanad"))
# ─── per-session state (reset on each connect) ───────────────────
class _Session:
    """Mutable playback/barge-in flags for one Gemini Live connection."""

    def __init__(self):
        # True while Gemini reply audio is being played through the speaker.
        self.speaking = False
        # True once speaker streaming has begun for the current reply.
        self.stream_started = False
        # Barge-in detection is suppressed until this wall-clock time.
        self.barge_block_until = 0.0
        # Wall-clock time the current AI reply started playing.
        self.ai_speak_start = 0.0
        # Wall-clock time of the most recent AI audio chunk.
        self.last_ai_audio = 0.0
# ─── main async loop ─────────────────────────────────────────────
def _build_config(types):
    """Assemble the LiveConnectConfig: spoken replies, VAD tuning, transcripts.

    `types` is the google.genai.types module, passed in because the SDK is
    imported lazily inside main_async.
    """
    vad_start = _STT.get("gemini_vad_start_sensitivity", "START_SENSITIVITY_HIGH")
    vad_end = _STT.get("gemini_vad_end_sensitivity", "END_SENSITIVITY_LOW")
    prefix_ms = int(_STT.get("gemini_vad_prefix_padding_ms", 20))
    silence_ms = int(_STT.get("gemini_vad_silence_duration_ms", 200))

    speech = types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=_VOICE),
        ),
    )
    realtime = types.RealtimeInputConfig(
        automatic_activity_detection=types.AutomaticActivityDetection(
            disabled=False,
            start_of_speech_sensitivity=getattr(types.StartSensitivity, vad_start),
            end_of_speech_sensitivity=getattr(types.EndSensitivity, vad_end),
            prefix_padding_ms=prefix_ms,
            silence_duration_ms=silence_ms,
        ),
    )
    return types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        speech_config=speech,
        realtime_input_config=realtime,
        # Both transcript streams: user text feeds the parent's side channel,
        # bot text is logged for review.
        input_audio_transcription=types.AudioTranscriptionConfig(),
        output_audio_transcription=types.AudioTranscriptionConfig(),
        system_instruction=types.Content(
            parts=[types.Part(text=_SYS_PROMPT)],
        ),
    )
async def _send_mic_loop(session, types_mod, mic, speaker, recorder, sess: _Session, done: asyncio.Event) -> None:
    """Stream G1 mic PCM to Gemini Live, with barge-in and echo suppression.

    Runs until `done` is set or stop is requested. Mic reads happen in the
    default executor so the event loop is never blocked.
    """
    loop = asyncio.get_event_loop()
    # Sleep one chunk's duration per iteration → roughly real-time pacing.
    frame_pause = CHUNK_SIZE / float(SEND_SAMPLE_RATE)
    loud_count = 0
    last_activity = time.time()
    while not done.is_set() and not _STOP_REQUESTED.is_set():
        try:
            raw = await loop.run_in_executor(None, mic.read_chunk, _CHUNK_BYTES)
        except Exception as e:
            log("warn", f"mic read failed: {e}")
            break
        if not raw:
            await asyncio.sleep(frame_pause)
            continue
        # Optional software gain, clipped to the int16 range.
        if _MIC_GAIN != 1.0:
            samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
            samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
            raw = samples.tobytes()
        energy = _audio_energy(raw)
        now = time.time()
        # Barge-in detection — sustained user energy interrupts the AI.
        if sess.speaking and now >= sess.barge_block_until:
            if (now - sess.ai_speak_start) >= _AI_GRACE:
                if energy > _BARGE_THRESHOLD:
                    loud_count += 1
                else:
                    loud_count = max(0, loud_count - 1)
                if loud_count > _BARGE_CHUNKS:
                    log("info", f"BARGE-IN (e={energy})")
                    emit({"type": "barge_in"})
                    sess.speaking = False
                    sess.stream_started = False
                    try:
                        speaker.stop()
                    except Exception:
                        pass
                    try:
                        mic.flush()
                    except Exception:
                        pass
                    try:
                        recorder.finish_turn()
                    except Exception:
                        pass
                    loud_count = 0
                    # Brief cooldown so one interruption doesn't retrigger.
                    sess.barge_block_until = now + _BARGE_COOLDOWN
        # Echo suppression — while AI speaks, mute quiet mic frames so the
        # mic doesn't feed Gemini its own voice.
        send_data = raw
        if sess.speaking and energy < _ECHO_SUPPRESS_BELOW:
            send_data = _SILENCE_PCM
        # Capture user audio for the per-turn WAV (only when user actually speaks).
        if energy > 250 and not sess.speaking:
            try:
                recorder.capture_user(raw)
            except Exception:
                pass
        # Heartbeat log every ~10s of silence so the parent knows we're alive.
        if energy > 250:
            last_activity = now
        elif now - last_activity > 10:
            log("info", f"alive (no speech {now - last_activity:.0f}s)")
            last_activity = now
        try:
            await session.send_realtime_input(
                audio=types_mod.Blob(
                    data=send_data,
                    mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}",
                ),
            )
        except asyncio.CancelledError:
            return
        except Exception as e:
            # A failed send usually means the session died — stop all loops.
            log("warn", f"mic send failed: {e}")
            done.set()
            return
        await asyncio.sleep(frame_pause)
async def _send_frame_loop(session, types_mod, done: asyncio.Event) -> None:
    """Periodically push the latest cached camera frame (JPEG) to Gemini Live."""
    while not (done.is_set() or _STOP_REQUESTED.is_set()):
        await asyncio.sleep(_FRAME_INTERVAL)
        with _LATEST_FRAME_LOCK:
            jpeg = _LATEST_FRAME.get("bytes")
            captured_at = _LATEST_FRAME.get("ts", 0.0)
        # Skip when no frame has arrived yet, or the cached one is stale —
        # don't waste tokens streaming a frame the camera abandoned.
        if not jpeg or (time.time() - captured_at) > _FRAME_MAX_AGE:
            continue
        try:
            await session.send_realtime_input(
                video=types_mod.Blob(data=jpeg, mime_type="image/jpeg"),
            )
        except asyncio.CancelledError:
            return
        except Exception as e:
            # Keep going — frames are best-effort.
            log("warn", f"frame send failed: {e}")
async def _receive_loop(session, speaker, recorder, sess: _Session, done: asyncio.Event) -> None:
    """Drain Gemini Live server events: transcripts, reply audio, turn ends.

    Plays reply audio through the G1 speaker (in the executor, to avoid
    blocking the loop), forwards transcripts on the stdout protocol, and
    always sets `done` on exit so the sibling send loops stop too.
    """
    loop = asyncio.get_event_loop()
    last_recv = time.time()
    try:
        while not done.is_set() and not _STOP_REQUESTED.is_set():
            async for response in session.receive():
                last_recv = time.time()
                if done.is_set():
                    break
                # Server asked us to reconnect — propagate to the parent.
                if (hasattr(response, "go_away")
                        and getattr(response, "go_away", None) is not None):
                    emit({"type": "reconnect", "reason": "server go_away"})
                    done.set()
                    return
                sc = getattr(response, "server_content", None)
                if sc is None:
                    continue
                # Server-side interruption: stop playback and close the turn.
                if getattr(sc, "interrupted", False) is True:
                    if sess.speaking:
                        log("info", "Gemini interrupted by server")
                        sess.speaking = False
                        sess.stream_started = False
                        try:
                            speaker.stop()
                        except Exception:
                            pass
                        try:
                            recorder.finish_turn()
                        except Exception:
                            pass
                    continue
                # User-speech transcription → parent's wake-word side channel.
                it = getattr(sc, "input_transcription", None)
                if it is not None:
                    text = (getattr(it, "text", "") or "").strip()
                    # Suppressed while the AI speaks: it would mostly be echo.
                    if text and not sess.speaking:
                        emit({"type": "user", "text": text})
                        try:
                            recorder.add_user_text(text)
                        except Exception:
                            pass
                # Gemini's own reply text (logged only — Gemini also speaks it).
                ot = getattr(sc, "output_transcription", None)
                if ot is not None:
                    text = (getattr(ot, "text", "") or "").strip()
                    if text:
                        emit({"type": "bot", "text": text})
                        try:
                            recorder.add_robot_text(text)
                        except Exception:
                            pass
                # Reply audio chunks → G1 speaker.
                mt = getattr(sc, "model_turn", None)
                if mt is not None:
                    for part in getattr(mt, "parts", []) or []:
                        inl = getattr(part, "inline_data", None)
                        if inl is not None and getattr(inl, "data", None):
                            now = time.time()
                            if not sess.speaking:
                                # First chunk of a reply: mark speech start.
                                sess.ai_speak_start = now
                                sess.speaking = True
                            sess.last_ai_audio = now
                            raw_audio = inl.data
                            try:
                                recorder.capture_robot(raw_audio)
                            except Exception:
                                pass
                            audio_arr = np.frombuffer(raw_audio, dtype=np.int16)
                            if not sess.stream_started:
                                await loop.run_in_executor(None, speaker.begin_stream)
                                sess.stream_started = True
                            await loop.run_in_executor(
                                None, speaker.send_chunk,
                                audio_arr, RECEIVE_SAMPLE_RATE,
                            )
                # End of the model's turn: drain the speaker, reset state.
                if getattr(sc, "turn_complete", False):
                    if sess.speaking and sess.stream_started and not speaker.interrupted:
                        log("info", f"speaker {speaker.total_sent_sec:.1f}s")
                        await loop.run_in_executor(None, speaker.wait_finish)
                    elif sess.speaking and speaker.interrupted:
                        log("info", "speaker interrupted")
                    sess.speaking = False
                    sess.stream_started = False
                    try:
                        # Drop any echo of the just-played reply.
                        _MIC_HOLDER and _MIC_HOLDER[0].flush()
                    except Exception:
                        pass
                    try:
                        recorder.finish_turn()
                    except Exception:
                        pass
                    emit({"type": "turn_end"})
            # receive() iterator ended — check for a stalled link before retrying.
            if time.time() - last_recv > _NO_MESSAGES_TIMEOUT:
                log("warn", f"no messages from Gemini for {_NO_MESSAGES_TIMEOUT:.0f}s")
                break
            await asyncio.sleep(0.1)
    except asyncio.CancelledError:
        return
    except Exception as e:
        log("warn", f"receive ended: {e}")
    finally:
        done.set()
async def main_async() -> int:
    """Connect / stream / reconnect until stop is requested.

    Returns a process exit code: 0 on clean shutdown, 2 = SDK missing,
    3 = no API key, 4 = client creation failed, 5 = G1 speaker unavailable.
    """
    if not _API_KEY:
        log("error", "no Gemini API key (set MARCUS_GEMINI_API_KEY)")
        return 3
    # Lazy import: google-genai is only available in the gemini_sdk env.
    try:
        from google import genai
        from google.genai import types
    except Exception as e:
        log("error", f"google-genai not importable: {e}")
        return 2
    try:
        client = genai.Client(api_key=_API_KEY)
    except Exception as e:
        log("error", f"failed to create Gemini client: {e}")
        return 4
    config = _build_config(types)
    # This subprocess owns the G1 mic for its whole lifetime; the stdin
    # watcher reaches it through _MIC_HOLDER for "flush" commands.
    mic = BuiltinMic()
    mic.start()
    _MIC_HOLDER.append(mic)
    speaker = _init_g1_speaker()
    if speaker is None:
        log("error", "G1 speaker not available — exiting")
        try:
            mic.stop()
        except Exception:
            pass
        return 5
    recorder = TurnRecorder(
        enabled=_REC_ENABLED,
        out_dir=_DATA_DIR,
        user_rate=SEND_SAMPLE_RATE,
        robot_rate=RECEIVE_SAMPLE_RATE,
    )
    session_num = 0
    consecutive_errors = 0
    start = time.time()
    try:
        # Outer reconnect loop — each iteration is one Live session.
        while not _STOP_REQUESTED.is_set():
            session_num += 1
            uptime_min = (time.time() - start) / 60
            try:
                log("info", f"connecting (session #{session_num}, uptime {uptime_min:.0f}m)")
                async with client.aio.live.connect(model=_MODEL, config=config) as session:
                    emit({"type": "ready"})
                    consecutive_errors = 0
                    mic.flush()
                    sess = _Session()
                    done = asyncio.Event()
                    # Run the three loops together; any one setting `done`
                    # (or the session timeout) ends all of them.
                    try:
                        await asyncio.wait_for(
                            asyncio.gather(
                                _send_mic_loop(session, types, mic, speaker, recorder, sess, done),
                                _send_frame_loop(session, types, done),
                                _receive_loop(session, speaker, recorder, sess, done),
                            ),
                            timeout=_SESSION_TIMEOUT,
                        )
                    except asyncio.TimeoutError:
                        log("info", f"session timed out after {_SESSION_TIMEOUT:.0f}s")
                    except asyncio.CancelledError:
                        pass
                log("info", f"session #{session_num} ended — reconnecting in 1s")
                try:
                    speaker.stop()
                except Exception:
                    pass
                try:
                    mic.flush()
                except Exception:
                    pass
                if _STOP_REQUESTED.is_set():
                    break
                await asyncio.sleep(1)
            except asyncio.CancelledError:
                break
            except Exception as e:
                consecutive_errors += 1
                # Exponential backoff, capped at _MAX_RECONNECT_DELAY.
                delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
                log("error", f"session error #{consecutive_errors}: {e} — retry in {delay:.0f}s")
                try:
                    await asyncio.sleep(delay)
                except asyncio.CancelledError:
                    break
                # Persistent failures: rebuild the client from scratch.
                if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
                    log("warn", f"{consecutive_errors} consecutive errors — recreating client")
                    try:
                        client = genai.Client(api_key=_API_KEY)
                        consecutive_errors = 0
                    except Exception as ce:
                        log("error", f"client recreation failed: {ce}")
    finally:
        # Always release the hardware, even on unexpected exits.
        try:
            mic.stop()
        except Exception:
            pass
        try:
            speaker.stop()
        except Exception:
            pass
    return 0
def main() -> int:
    """Process entry point: wire signal handlers, then drive the async loop."""
    _install_signal_handlers()
    try:
        exit_code = asyncio.run(main_async())
    except KeyboardInterrupt:
        exit_code = 0
    except Exception as e:
        log("error", f"fatal: {e}")
        exit_code = 4
    return exit_code


if __name__ == "__main__":
    sys.exit(main())