Sanadv3/gemini/subprocess.py

405 lines
16 KiB
Python

"""Gemini live subprocess supervisor.
Spawns `voice/sanad_voice.py` as a managed child with `SANAD_VOICE_BRAIN=gemini`,
tails the child's stdout, and extracts state transitions + user transcripts
from the Gemini-specific log lines emitted by `gemini/script.py:GeminiBrain`.
When a new model is added, build its own sibling supervisor (see
`voice/model_subprocess.py` for the template) — do not refactor this file.
"""
from __future__ import annotations
import base64
import json
import os
import signal
import subprocess
import sys
import threading
from collections import deque
from datetime import datetime
from typing import Any, Optional, Union
from pathlib import Path
from Project.Sanad.config import BASE_DIR, LOGS_DIR, SCRIPTS_DIR, LIVE_TUNE
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
log = get_logger("gemini_subprocess")
_LS_CFG = _cfg_section("gemini", "subprocess")
# Camera frame forwarding — push the latest JPEG to the child over stdin
# at this interval (seconds). 0.5 s ≈ 2 fps, matching the child's
# SANAD_VISION_SEND_HZ default. The child de-stales + relays to Gemini.
_FRAME_FORWARD_INTERVAL_S = float(_LS_CFG.get("frame_forward_interval_sec", 0.5))
def _resolve_live_script() -> Path:
"""Locate the voice script to run as subprocess.
Default: voice/sanad_voice.py (the canonical G1 built-in mic +
AudioClient speaker path). Override with SANAD_LIVE_SCRIPT.
"""
override = os.environ.get("SANAD_LIVE_SCRIPT", "").strip()
if override:
p = Path(override).expanduser()
if p.exists():
return p
for c in (BASE_DIR / "voice" / "sanad_voice.py",
SCRIPTS_DIR / "sanad_voice.py"):
if c.exists():
return c
return SCRIPTS_DIR / "sanad_voice.py"
LIVE_SCRIPT = _resolve_live_script()
LOG_TAIL_SIZE = _LS_CFG.get("log_tail_size", 2000)
TRANSCRIPT_TAIL_SIZE = _LS_CFG.get("transcript_tail_size", 30)
# Persistent on-disk log for the full subprocess session.
LIVE_LOG_DIR = LOGS_DIR
LIVE_LOG_NAME = _LS_CFG.get("log_name", "gemini_subprocess")
_STOP_TIMEOUT_SEC = _LS_CFG.get("stop_timeout_sec", 3.0)
_TERMINATE_TIMEOUT_SEC = _LS_CFG.get("terminate_timeout_sec", 2.0)
_NOISY_PREFIXES = tuple(_LS_CFG.get("noisy_prefixes", [
"ALSA lib ", "Expression 'alsa_", "Cannot connect to server socket",
"jack server is not running",
]))
_NOISY_FRAGMENTS = tuple(_LS_CFG.get("noisy_fragments", [
"Unknown PCM", "Evaluate error", "snd_pcm_open_noupdate",
"PaAlsaStream", "snd_config_evaluate", "snd_func_refer",
]))
class GeminiSubprocess:
def __init__(self):
self._lock = threading.Lock()
self.process: subprocess.Popen | None = None
self.log_tail: deque[str] = deque(maxlen=LOG_TAIL_SIZE)
self.user_transcript: deque[str] = deque(maxlen=TRANSCRIPT_TAIL_SIZE)
self._reader_thread: threading.Thread | None = None
self._log_file = None # opened per-session in _reader_loop
self.state = "stopped"
self.state_message = "Idle."
self.last_user_text = ""
self.suppressed_noise = 0
# ── stdin push channel (camera frames + motion state) ──
# The child (gemini/script.py) reads "frame:<b64>\n" and
# "state:<json>\n" lines off its stdin. Writes are serialised
# because the frame forwarder and the motion-state bus handler
# both call from different threads.
self._stdin_lock = threading.Lock()
self._camera = None # set via attach_camera()
self._frame_thread: threading.Thread | None = None
self._frame_stop = threading.Event()
# ── camera attach (called once from main.py) ──────────────
def attach_camera(self, camera) -> None:
"""Give the supervisor a reference to the CameraDaemon so it can
forward frames to the child over stdin while a session runs."""
self._camera = camera
def _open_session_log(self, pid: int):
"""Open (or re-open) the per-day append log file for this session."""
try:
LIVE_LOG_DIR.mkdir(parents=True, exist_ok=True)
fname = f"{LIVE_LOG_NAME}_{datetime.now().strftime('%Y%m%d')}.log"
fh = open(LIVE_LOG_DIR / fname, "a", encoding="utf-8", buffering=1)
fh.write(
f"\n===== live_gemini subprocess start "
f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} pid={pid} =====\n"
)
return fh
except Exception as exc:
log.warning("Could not open live-gemini log file: %s", exc)
return None
def _is_noisy(self, line: str) -> bool:
return line.startswith(_NOISY_PREFIXES) or any(f in line for f in _NOISY_FRAGMENTS)
def _set_state(self, state: str, msg: str):
self.state = state
self.state_message = msg
def _track_line(self, line: str):
"""Parse Gemini-specific log markers emitted by `gemini/script.py`.
Must stay in lock-step with the `log.info(...)` strings in
`GeminiBrain`. If you add a new state, add the emit in the brain
AND the matching detector here — in one PR.
"""
if "connecting to Gemini" in line:
self._set_state("connecting", line)
elif "connected — speak anytime" in line or "connected - speak anytime" in line:
self._set_state("listening", "Listening for speech.")
elif " USER: " in line or line.strip().startswith("USER:"):
# GeminiBrain emits: log.info("USER: %s", text)
text = line.split("USER:", 1)[1].strip()
if text:
self.last_user_text = text
self.user_transcript.append(text)
self._set_state("hearing", f"User: {text}")
elif "BARGE-IN" in line or "Gemini interrupted" in line or "interrupt (" in line:
self._set_state("interrupting", line)
elif "listening" in line.lower() and "no speech" not in line:
# Fires on "listening" (post-turn) — keep the state fresh.
self._set_state("listening", "Listening for speech.")
elif "session error" in line or "client recreation failed" in line:
self._set_state("error", line)
elif "server going away" in line or "session ended" in line or "session dead" in line:
self._set_state("warning", line)
elif "keyboard interrupt" in line or "cancelled — stopping" in line:
self._set_state("stopped", line)
def _reader_loop(self):
proc = self.process
if proc is None or proc.stdout is None:
return
# Every line goes to the on-disk log — including the ALSA noise
# that we filter out of the in-memory tail. That way a field
# post-mortem has the full raw capture if we need it.
fh = self._open_session_log(proc.pid)
self._log_file = fh
for line in proc.stdout:
clean = line.rstrip()
if not clean:
continue
if fh is not None:
try:
fh.write(clean + "\n")
except Exception:
pass
with self._lock:
if self._is_noisy(clean):
self.suppressed_noise += 1
continue
self.log_tail.append(clean)
self._track_line(clean)
with self._lock:
self.log_tail.append("Live Gemini process exited.")
self._set_state("stopped", "Process exited.")
if fh is not None:
try:
fh.write(
f"===== live_gemini subprocess exit "
f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====\n"
)
fh.close()
except Exception:
pass
self._log_file = None
def is_running(self) -> bool:
with self._lock:
return self.process is not None and self.process.poll() is None
def start(self) -> dict[str, Any]:
with self._lock:
if self.process is not None and self.process.poll() is None:
return {"started": False, "message": "Already running.", "pid": self.process.pid}
self._set_state("starting", "Starting...")
script = LIVE_SCRIPT
if not script.exists():
raise RuntimeError(f"Script not found: {script}")
env = os.environ.copy()
env.update({"PYTHONUNBUFFERED": "1", **LIVE_TUNE})
# Pass the current G1 speaker volume as an env var so the
# subprocess can compute the correct barge-in threshold at
# startup. Without this, sanad_voice.py would read the volume
# from a stale or non-existent config file path and default to
# 100, scaling the barge-in threshold wrong for any non-100%
# volume. load_config() reads data/motions/config.json — the
# file the dashboard writes to when the user moves the slider.
try:
from Project.Sanad.config import load_config
_cfg = load_config() or {}
_audio_cfg = _cfg.get("audio") if isinstance(_cfg.get("audio"), dict) else {}
_g1_vol = int(_audio_cfg.get("g1_volume", 100))
_g1_vol = max(0, min(100, _g1_vol))
env["SANAD_G1_VOLUME"] = str(_g1_vol)
log.info("Passing SANAD_G1_VOLUME=%d to subprocess", _g1_vol)
except Exception as exc:
log.warning("Could not read g1_volume for subprocess: %s", exc)
# sanad_voice.py takes the DDS interface as the first positional arg
dds_iface = env.get("SANAD_DDS_INTERFACE", "eth0")
cmd = [sys.executable, str(script), dds_iface]
proc = subprocess.Popen(
cmd,
cwd=str(script.parent),
stdin=subprocess.PIPE, # camera frames + motion state push
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
env=env,
)
# Reap any stale frame forwarder from a previous session that ended
# by a child crash rather than a clean stop() — otherwise it keeps
# spinning on a dead pipe and we'd leak a thread per restart.
stale_ft = self._frame_thread
if stale_ft is not None and stale_ft.is_alive():
self._frame_stop.set()
stale_ft.join(timeout=2.0)
with self._lock:
self.process = proc
self.log_tail.append(f"Started: pid={proc.pid}")
self._set_state("starting", f"pid={proc.pid}")
self._reader_thread = threading.Thread(target=self._reader_loop, daemon=True)
self._reader_thread.start()
# Frame forwarder — pushes camera JPEGs to the child over stdin.
self._frame_stop.clear()
self._frame_thread = threading.Thread(
target=self._frame_forwarder, daemon=True, name="gemini-frame-fwd",
)
self._frame_thread.start()
log.info("Live Gemini subprocess started: pid=%d", proc.pid)
return {"started": True, "pid": proc.pid}
# ── stdin push channel ────────────────────────────────────
def _send_stdin(self, line: str) -> None:
"""Serialised stdin write — frame forwarder + motion-state handler
both call this from different threads. Best-effort: a closed pipe
or a not-yet-started process is a silent no-op."""
proc = self.process
if proc is None or proc.stdin is None:
return
try:
with self._stdin_lock:
if not proc.stdin.closed:
proc.stdin.write(line)
proc.stdin.flush()
except Exception:
# Pipe broke (child exited) — drop silently; the reader thread
# will surface the exit via state="stopped".
pass
def send_frame(self, jpeg: Union[bytes, str]) -> None:
"""Forward one camera frame to the child as 'frame:<base64>\\n'.
Accepts raw JPEG bytes (base64-encoded here) or an already-base64
ASCII string (e.g. CameraDaemon.get_frame_b64() — no re-encode)."""
if isinstance(jpeg, bytes):
b64 = base64.b64encode(jpeg).decode("ascii")
elif isinstance(jpeg, str):
b64 = jpeg.strip()
else:
return
if b64:
self._send_stdin("frame:" + b64 + "\n")
def send_state(self, event: str, cmd: str,
elapsed_sec: Optional[float] = None,
reason: Optional[str] = None) -> None:
"""Push a motion-state update to the child as 'state:<json>\\n'.
Events: start | complete | interrupted | error. The child injects
'[STATE-...] <cmd>' into the live Gemini session as silent text
context so Gemini can answer "what are you doing?" honestly."""
if not event or not cmd:
return
payload: dict[str, Any] = {"event": event, "cmd": cmd}
if elapsed_sec is not None:
payload["elapsed_sec"] = round(float(elapsed_sec), 2)
if reason:
payload["reason"] = str(reason)[:200]
try:
line = "state:" + json.dumps(payload, ensure_ascii=False) + "\n"
except Exception:
return
self._send_stdin(line)
def _frame_forwarder(self) -> None:
"""Background thread — push the camera's latest frame to the child.
Runs for the lifetime of one subprocess session. Gated on the
camera actually running; the child does its own vision-enabled +
staleness checks, so this stays dumb (camera up → push)."""
cam = self._camera
if cam is None:
return
while not self._frame_stop.is_set():
if self._frame_stop.wait(_FRAME_FORWARD_INTERVAL_S):
break
try:
if not cam.is_running():
continue
b64 = cam.get_frame_b64()
if b64:
self.send_frame(b64)
except Exception:
# Best-effort — never let a frame hiccup kill the thread.
pass
def stop(self) -> dict[str, Any]:
with self._lock:
proc = self.process
if proc is None or proc.poll() is not None:
return {"stopped": False, "message": "Not running."}
self._set_state("stopping", "Stopping...")
# Halt the frame forwarder before we tear the pipe down.
self._frame_stop.set()
ft = self._frame_thread
if ft is not None:
ft.join(timeout=2.0)
self._frame_thread = None
try:
proc.send_signal(signal.SIGINT)
proc.wait(timeout=_STOP_TIMEOUT_SEC)
except subprocess.TimeoutExpired:
proc.terminate()
try:
proc.wait(timeout=_TERMINATE_TIMEOUT_SEC)
except subprocess.TimeoutExpired:
proc.kill()
proc.wait(timeout=_TERMINATE_TIMEOUT_SEC)
rc = proc.returncode
# Close stdin/stdout explicitly — without this each start/stop
# cycle leaks FDs (relied on Popen.__del__ which only runs at GC;
# a reconnect loop would march the FD count to the OS limit).
for pipe in (getattr(proc, "stdin", None), getattr(proc, "stdout", None)):
if pipe is not None:
try:
pipe.close()
except Exception:
pass
with self._lock:
self.process = None
self.log_tail.append("Stopped.")
self._set_state("stopped", "Stopped.")
log.info("Live Gemini subprocess stopped (rc=%s)", rc)
return {"stopped": True, "returncode": rc}
def status(self) -> dict[str, Any]:
with self._lock:
running = self.process is not None and self.process.poll() is None
return {
"running": running,
"pid": self.process.pid if running and self.process else None,
"state": self.state,
"state_message": self.state_message,
"last_user_text": self.last_user_text,
"user_transcript": list(self.user_transcript),
"log_tail": list(self.log_tail),
"suppressed_noise": self.suppressed_noise,
}