Marcus/Voice/gemini_script.py

"""Voice/gemini_script.py — subprocess manager for Gemini Live STT.

Runs in marcus's Python 3.8 env. The actual Gemini STT lives in
[Voice/gemini_runner.py](Voice/gemini_runner.py) which has to run in a
Python 3.10+ env (e.g. the `gemini_sdk` conda env on the Jetson) because
`google-genai` doesn't support Python 3.8.

This file spawns the runner as a subprocess, reads JSON-line transcripts
off its stdout, and turns them into the same `on_transcript` / `on_command`
callbacks the rest of marcus expects. The external API of class
`GeminiBrain` is unchanged from the previous in-process port — drop-in
swap for `Voice/marcus_voice.py::_voice_loop_gemini`.

Sanad uses the same subprocess pattern (its own `live_voice_loop.py`
parses log lines from a Gemini subprocess), so this matches Sanad's
architecture not just in mechanism but in shape.

────────────────────────────────────────────────────────────────────────
Subprocess lookup order for the Python 3.10+ binary:
    1. env  MARCUS_GEMINI_PYTHON                     (highest priority)
    2. config  stt.gemini_python_path
    3. auto-detect — try a list of common conda env paths
    4. raise — explicit error in voice.log
────────────────────────────────────────────────────────────────────────
"""

from __future__ import annotations

import base64
import json
import logging
import os
import subprocess
import sys
import threading
from typing import Callable, Optional, Union

log = logging.getLogger("gemini_brain")


# Candidate conda-env paths for the Python 3.10+ binary. Override with
# MARCUS_GEMINI_PYTHON or stt.gemini_python_path if the env lives elsewhere.
_DEFAULT_CANDIDATES = [
    "~/miniconda3/envs/gemini_sdk/bin/python",
    "~/anaconda3/envs/gemini_sdk/bin/python",
    "~/.miniconda3/envs/gemini_sdk/bin/python",
    "/opt/conda/envs/gemini_sdk/bin/python",
    "~/miniconda3/envs/sanad/bin/python",
    "~/anaconda3/envs/sanad/bin/python",
]


def _resolve_runner_python(stt_cfg: dict) -> str:
    """Find the Python 3.10+ binary that can import google-genai."""
    explicit = (
        os.environ.get("MARCUS_GEMINI_PYTHON")
        or stt_cfg.get("gemini_python_path", "")
    )
    if explicit:
        path = os.path.expanduser(explicit)
        if os.path.isfile(path) and os.access(path, os.X_OK):
            return path
        raise FileNotFoundError(
            "MARCUS_GEMINI_PYTHON / stt.gemini_python_path = "
            "{!r} but that binary does not exist or is not executable".format(path)
        )
    for cand in _DEFAULT_CANDIDATES:
        path = os.path.expanduser(cand)
        if os.path.isfile(path) and os.access(path, os.X_OK):
            log.info("auto-detected gemini-runner python at %s", path)
            return path
    raise FileNotFoundError(
        "no Python 3.10+ env found for the Gemini runner. Set env "
        "MARCUS_GEMINI_PYTHON to the path of a conda env's python with "
        "`google-genai` installed (e.g. ~/miniconda3/envs/gemini_sdk/bin/python)."
    )


class GeminiBrain:
    """Subprocess-managing wrapper around Voice/gemini_runner.py.

    External API kept identical to the in-process version so callers don't
    care that Gemini lives in another Python:

        brain = GeminiBrain(audio_io, recorder, voice_name, system_prompt,
                            api_key=..., on_transcript=cb1, on_command=cb2)
        brain.start()
        ...
        brain.stop()

    `audio_io` and `recorder` are accepted for API parity but unused —
    the subprocess owns its own mic and writes its own WAVs (one process
    owning the whole audio path is simpler than streaming PCM over a pipe).
    """

    def __init__(
        self,
        audio_io,                                   # ignored (runner owns its own)
        recorder,                                   # ignored (runner owns its own)
        voice_name=None,                            # forwarded via env
        system_prompt="",                           # forwarded via env (or config)
        *,
        api_key: str = "",
        on_transcript: Optional[Callable[[str], None]] = None,
        on_command: Optional[Callable[[str, str], None]] = None,
        on_bot_text: Optional[Callable[[str], None]] = None,
        on_turn_end: Optional[Callable[[], None]] = None,
    ):
        self._voice_name = voice_name or ""
        self._system_prompt = system_prompt or ""
        self._api_key = api_key
        self._on_transcript = on_transcript
        self._on_command = on_command
        self._on_bot_text = on_bot_text
        self._on_turn_end = on_turn_end

        self._proc = None                           # type: Optional[subprocess.Popen]
        self._reader_thread = None                  # type: Optional[threading.Thread]
        self._err_thread = None                     # type: Optional[threading.Thread]
        self._stopping = False
        self._stdin_lock = threading.Lock()         # serialise stdin writes

        # config-loaded lazily so import order doesn't matter
        try:
            from Core.config_loader import load_config
            cfg = load_config("Voice") or {}
        except Exception:
            cfg = {}
        self._stt = cfg.get("stt", {})

    # ─── lifecycle ────────────────────────────────────────

    def start(self) -> None:
        if self._proc is not None and self._proc.poll() is None:
            log.warning("GeminiBrain subprocess already running")
            return
        self._stopping = False

        try:
            python_bin = _resolve_runner_python(self._stt)
        except FileNotFoundError as e:
            log.error("%s", e)
            return

        runner = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "gemini_runner.py")
        )
        if not os.path.isfile(runner):
            log.error("gemini_runner.py not found at %s", runner)
            return

        env = os.environ.copy()
        if self._api_key:
            env["MARCUS_GEMINI_API_KEY"] = self._api_key
        if self._voice_name:
            env["MARCUS_GEMINI_VOICE"] = self._voice_name
        # Forward the system prompt via env so the runner doesn't have to
        # re-read the JSON file (and so a trimmed inline string survives).
        if self._system_prompt:
            env["MARCUS_GEMINI_SYSTEM_PROMPT"] = self._system_prompt
        env["MARCUS_PROJECT_ROOT"] = os.path.dirname(os.path.dirname(runner))

        log.info("spawning gemini runner: %s -u %s", python_bin, runner)
        try:
            self._proc = subprocess.Popen(
                [python_bin, "-u", runner],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.dirname(runner)),
                env=env,
                bufsize=1,
                universal_newlines=True,
            )
        except Exception as e:
            log.error("failed to spawn gemini runner: %s", e)
            self._proc = None
            return

        self._reader_thread = threading.Thread(
            target=self._stdout_reader, daemon=True, name="gemini-stdout",
        )
        self._reader_thread.start()
        self._err_thread = threading.Thread(
            target=self._stderr_reader, daemon=True, name="gemini-stderr",
        )
        self._err_thread.start()

    def flush_mic(self) -> None:
        """Tell the runner to drop its mic buffer (echo prevention)."""
        self._send_stdin("flush\n")

    def send_frame(self, jpeg: Union[bytes, str]) -> None:
        """
        Forward a single camera frame (JPEG) to the runner so it can stream
        it to Gemini Live. The runner throttles + de-stales internally.

        `jpeg` may be raw bytes (e.g. from cv2.imencode) OR an already-
        base64 ASCII string (e.g. from API.camera_api.get_frame()).
        """
        proc = self._proc
        if proc is None or proc.stdin is None:
            return
        if isinstance(jpeg, bytes):
            b64 = base64.b64encode(jpeg).decode("ascii")
        elif isinstance(jpeg, str):
            b64 = jpeg.strip()
        else:
            return
        if not b64:
            return
        self._send_stdin("frame:" + b64 + "\n")

    def _send_stdin(self, line: str) -> None:
        """Serialised stdin write — multiple threads (frame sender + flush) can call safely."""
        proc = self._proc
        if proc is None or proc.stdin is None:
            return
        try:
            with self._stdin_lock:
                if not proc.stdin.closed:
                    proc.stdin.write(line)
                    proc.stdin.flush()
        except Exception:
            pass

    def stop(self) -> None:
        self._stopping = True
        proc = self._proc
        if proc is None:
            return
        # Polite stop: send "stop\n" on stdin, then wait briefly, then SIGTERM.
        self._send_stdin("stop\n")
        try:
            proc.wait(timeout=3)
        except Exception:
            try:
                proc.terminate()
            except Exception:
                pass
            try:
                proc.wait(timeout=2)
            except Exception:
                try:
                    proc.kill()
                except Exception:
                    pass
        self._proc = None

    # ─── stdout / stderr drainers ─────────────────────────

    def _stdout_reader(self) -> None:
        proc = self._proc
        if proc is None or proc.stdout is None:
            return
        for line in proc.stdout:
            if self._stopping:
                break
            line = line.strip()
            if not line:
                continue
            try:
                msg = json.loads(line)
            except Exception:
                # Non-JSON line — log it raw so we can debug runner crashes.
                log.warning("gemini-runner stdout (non-JSON): %s", line[:200])
                continue
            self._handle_msg(msg)

    def _stderr_reader(self) -> None:
        proc = self._proc
        if proc is None or proc.stderr is None:
            return
        for line in proc.stderr:
            line = line.rstrip()
            if line:
                log.warning("gemini-runner stderr: %s", line[:200])

    def _handle_msg(self, msg: dict) -> None:
        t = msg.get("type")
        if t == "user":
            text = (msg.get("text") or "").strip()
            if not text:
                return
            log.info("USER: %s", text)
            if self._on_transcript is not None:
                try:
                    self._on_transcript(text)
                except Exception as e:
                    log.error("on_transcript failed: %s", e)
            if self._on_command is not None:
                try:
                    self._on_command(text, "en")
                except Exception as e:
                    log.error("on_command failed: %s", e)
        elif t == "bot":
            txt = (msg.get("text") or "").strip()
            if txt:
                log.info("GEMINI: %s", txt[:120])
                if self._on_bot_text is not None:
                    try:
                        self._on_bot_text(txt)
                    except Exception as e:
                        log.error("on_bot_text failed: %s", e)
        elif t == "turn_end":
            log.info("listening")
            if self._on_turn_end is not None:
                try:
                    self._on_turn_end()
                except Exception as e:
                    log.error("on_turn_end failed: %s", e)
        elif t == "ready":
            log.info("connected — listening for speech")
        elif t == "reconnect":
            log.info("server signalled reconnect: %s", msg.get("reason", ""))
        elif t == "log":
            level = msg.get("level", "info")
            text = msg.get("msg", "")
            if level == "error":
                log.error("[runner] %s", text)
            elif level == "warn":
                log.warning("[runner] %s", text)
            else:
                log.info("[runner] %s", text)
        else:
            log.debug("gemini-runner unknown type=%r: %s", t, msg)