792 lines
34 KiB
Python
792 lines
34 KiB
Python
"""Gemini brain — live conversation loop using the google-genai SDK.
|
|
|
|
Implements the VoiceBrain contract documented in `voice/model_script.py`:
|
|
|
|
__init__(audio_io, recorder, voice_name, system_prompt)
|
|
async run()
|
|
stop()
|
|
|
|
Owns everything Gemini-specific: the `genai.Client`, `LiveConnectConfig`,
|
|
the session connect/receive loop, VAD-based barge-in, echo suppression,
|
|
reconnect backoff. Hardware I/O is delegated to `audio_io` and per-turn
|
|
WAV capture to `recorder` — both are model-agnostic.
|
|
|
|
Env overrides:
|
|
SANAD_GEMINI_MODEL — Gemini Live model id (without "models/" prefix)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import array
|
|
import asyncio
|
|
import base64
|
|
import json
|
|
import os
|
|
import sys
|
|
import threading
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import numpy as np
|
|
|
|
from google import genai
|
|
from google.genai import types
|
|
|
|
from Project.Sanad.config import (
|
|
BASE_DIR,
|
|
CHUNK_SIZE,
|
|
GEMINI_API_KEY,
|
|
GEMINI_VOICE,
|
|
RECEIVE_SAMPLE_RATE,
|
|
SEND_SAMPLE_RATE,
|
|
)
|
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
|
from Project.Sanad.core.logger import get_logger
|
|
from Project.Sanad.vision import recognition_state as _recog_state
|
|
|
|
log = get_logger("gemini_brain")

# Config sections looked up once at import; each is read with .get() below.
_SV = _cfg_section("voice", "sanad_voice")
_VAD = _cfg_section("voice", "vad")
_BI = _cfg_section("voice", "barge_in")


def _env_number(name: str, default: Any, cast: Any) -> Any:
    """Return env var *name* converted with *cast*, or *default* if unusable.

    A missing variable yields *default*. A value that *cast* rejects (for
    example an empty string or non-numeric text) is logged and also yields
    *default*, so one bad override cannot crash the process at import time.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return cast(raw)
    except (TypeError, ValueError):
        log.warning("ignoring malformed %s=%r — using default %r",
                    name, raw, default)
        return default


# Gemini Live model id (without the "models/" prefix).
_MODEL = os.environ.get(
    "SANAD_GEMINI_MODEL",
    "gemini-2.5-flash-native-audio-preview-12-2025",
)
_MIC_GAIN = _SV.get("mic_gain", 1.0)
_SESSION_TIMEOUT = _SV.get("session_timeout_sec", 660)
_MAX_RECONNECT_DELAY = _SV.get("max_reconnect_delay_sec", 30)
_MAX_CONSECUTIVE_ERRORS = _SV.get("max_consecutive_errors", 10)
_NO_MESSAGES_TIMEOUT = _SV.get("no_messages_timeout_sec", 30)

# One mic chunk in bytes, and a same-sized all-zero buffer used to mask echo.
# NOTE(review): the factor 2 presumably means CHUNK_SIZE counts int16 samples
# (the mic loop decodes chunks as int16) — confirm against config.
_CHUNK_BYTES = CHUNK_SIZE * 2
_SILENCE_PCM = b"\x00" * _CHUNK_BYTES

# ── Recognition (camera + face gallery) tunables ──
_RECOG_STATE_PATH = Path(os.environ.get(
    "SANAD_RECOGNITION_STATE_PATH",
    str(BASE_DIR / "data" / ".recognition_state.json"),
))
_VISION_SEND_HZ = _env_number("SANAD_VISION_SEND_HZ", 2.0, float)
_VISION_STALE_MS = _env_number("SANAD_VISION_STALE_MS", 1500, int)
_RECOG_POLL_S = _env_number("SANAD_RECOGNITION_POLL_S", 1.0, float)
_FACES_DIR = Path(os.environ.get(
    "SANAD_FACES_DIR",
    str(BASE_DIR / "data" / "faces"),
))
_FACES_MAX_SAMPLES = _env_number("SANAD_FACES_MAX_SAMPLES", 3, int)
_FACES_PRIMER_RESIZE = _env_number("SANAD_FACES_PRIMER_RESIZE", 256, int)


# ── stdin push channel (Marcus pattern) ──────────────────────
# The GeminiSubprocess supervisor writes two line types to this process's
# stdin:
#   "frame:<base64-jpeg>\n"  — a camera frame to relay to Gemini Live
#   "state:<json>\n"         — a motion-state update to inject as text
# A daemon thread parses them into the caches below; the asyncio tasks
# _send_frame_loop / _send_state_loop drain those caches.

# Most recent decoded frame and the wall-clock time it arrived.
_LATEST_FRAME_LOCK = threading.Lock()
_LATEST_FRAME: dict = {"bytes": None, "ts": 0.0}

# Formatted state messages waiting to be injected into the live session.
_STATE_LOCK = threading.Lock()
_STATE_PENDING: list[str] = []

# Maps the "event" field of a state payload to the tag prefixed to its text.
_STATE_TAGS = {
    "start": "[STATE-START]",
    "complete": "[STATE-DONE]",
    "interrupted": "[STATE-INTERRUPTED]",
    "error": "[STATE-ERROR]",
    "paused": "[STATE-PAUSED]",
    "resumed": "[STATE-RESUMED]",
}
|
|
|
|
|
|
def _state_message(payload: Any) -> Optional[str]:
    """Render one decoded 'state:' payload as tagged text, or None to skip it.

    The payload must be a JSON object whose "event" is a key of _STATE_TAGS
    and whose "cmd" is a non-empty string. Anything else (a list, a number,
    non-string fields, an unknown event) yields None instead of raising.
    """
    if not isinstance(payload, dict):
        return None
    event = payload.get("event") or ""
    cmd = payload.get("cmd") or ""
    if not isinstance(event, str) or not isinstance(cmd, str):
        return None
    event = event.strip().lower()
    cmd = cmd.strip()
    tag = _STATE_TAGS.get(event)
    if not tag or not cmd:
        return None
    msg = f"{tag} {cmd}"
    elapsed = payload.get("elapsed_sec")
    if isinstance(elapsed, (int, float)):
        msg += f" ({float(elapsed):.1f}s)"
    reason = payload.get("reason")
    # The reason is only appended for error events.
    if reason and event == "error":
        msg += f" — {reason}"
    return msg


def _stdin_watcher() -> None:
    """Daemon thread — parse 'frame:' / 'state:' lines off stdin.

    'frame:' lines are base64-decoded into _LATEST_FRAME (with an arrival
    timestamp); 'state:' lines are JSON-decoded, formatted and appended to
    _STATE_PENDING. Best-effort: any malformed line is skipped without
    stopping the thread. Exits when the parent closes our stdin (subprocess
    teardown) or when reading stdin itself fails.
    """
    try:
        for line in sys.stdin:
            line = line.rstrip("\n")
            if not line:
                continue
            if line.startswith("frame:"):
                try:
                    data = base64.b64decode(line[len("frame:"):])
                except Exception:
                    continue
                if data:
                    with _LATEST_FRAME_LOCK:
                        _LATEST_FRAME["bytes"] = data
                        _LATEST_FRAME["ts"] = time.time()
            elif line.startswith("state:"):
                try:
                    payload = json.loads(line[len("state:"):])
                    msg = _state_message(payload)
                except Exception:
                    # Bad JSON or an unformattable field: drop this line only.
                    continue
                if msg is None:
                    continue
                with _STATE_LOCK:
                    _STATE_PENDING.append(msg)
    except Exception:
        # stdin unreadable or closed underneath us — nothing left to watch.
        return
|
|
|
|
|
|
# Launch the stdin watcher as a side effect of importing this module. It
# blocks harmlessly on sys.stdin until the supervisor sends something, and
# it is a daemon thread so it can never hold up interpreter exit.
threading.Thread(
    name="stdin-watcher",
    target=_stdin_watcher,
    daemon=True,
).start()
|
|
|
|
|
|
def _audio_energy(pcm: bytes) -> int:
    """Return the mean absolute amplitude of *pcm* read as signed 16-bit samples.

    Samples are decoded with ``array.array("h", ...)`` (native byte order).
    Empty input, or input that cannot be decoded (for example an odd number
    of bytes), yields 0.
    """
    try:
        frame = array.array("h", pcm)
        count = len(frame)
        if count == 0:
            return 0
        return sum(map(abs, frame)) // count
    except Exception:
        return 0
|
|
|
|
|
|
class GeminiBrain:
    """Gemini Live conversation brain — reconnect-safe.

    Public surface is ``__init__(audio_io, recorder, voice_name,
    system_prompt)``, ``async run()`` and ``stop()``. ``run()`` keeps a Gemini
    Live session open and reconnects whenever it ends; within a session five
    cooperating loops stream mic audio up, play model audio down, relay camera
    frames, inject motion-state text, and mirror the recognition state file.
    """

    def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
                 system_prompt: str = ""):
        """Store collaborators and read the boot-time recognition flags.

        Args:
            audio_io: object exposing ``.mic`` and ``.speaker``.
            recorder: per-turn capture sink (``capture_user``,
                ``capture_robot``, ``add_user_text``, ``add_robot_text``,
                ``finish_turn`` are called on it).
            voice_name: prebuilt voice name; falls back to GEMINI_VOICE.
            system_prompt: text sent as the session's system instruction.
        """
        self._audio = audio_io
        self._mic = audio_io.mic
        self._speaker = audio_io.speaker
        self._recorder = recorder
        self._voice = voice_name or GEMINI_VOICE
        self._system_prompt = system_prompt
        self._api_key = GEMINI_API_KEY
        self._stop_flag = asyncio.Event()
        # per-session state (reset in the outer reconnect loop)
        self._speaking = False
        self._stream_started = False
        self._barge_block_until = 0.0
        self._ai_speak_start = 0.0
        self._last_ai_audio = 0.0
        self._done: Optional[asyncio.Event] = None
        # ── Recognition flags — kept in sync with the state file by
        # _recognition_state_watcher. At boot a flag is on if either the
        # state file or the matching SANAD_* env var ("1") says so.
        _initial = _recog_state.read(_RECOG_STATE_PATH)
        self._vision_enabled = bool(
            _initial.vision_enabled
            or os.environ.get("SANAD_VISION_ENABLE", "0") == "1"
        )
        self._face_rec_enabled = bool(
            _initial.face_rec_enabled
            or os.environ.get("SANAD_FACE_RECOGNITION_ENABLE", "0") == "1"
        )
        self._gallery_version_primed = -1  # bumped after first successful primer

    def stop(self) -> None:
        """Signal the run loop to exit at the next opportunity."""
        try:
            self._stop_flag.set()
        except Exception:
            pass

    # ─── public entry point ───────────────────────────────

    async def run(self) -> None:
        """Connect to Gemini Live and keep reconnecting until stopped.

        Each iteration opens one session and runs the five session loops
        until one of them finishes or fails, or _SESSION_TIMEOUT elapses. A
        clean end reconnects after 1s. An exception reconnects after an
        exponential delay capped at _MAX_RECONNECT_DELAY, and after
        _MAX_CONSECUTIVE_ERRORS failures in a row the client is rebuilt.
        The loop exits when ``stop()`` was called, or on cancellation /
        KeyboardInterrupt raised outside the session loops.
        """
        client = genai.Client(api_key=self._api_key)
        config = self._build_config()
        session_num = 0
        start_time = time.time()
        consecutive_errors = 0

        while not self._stop_flag.is_set():
            session_num += 1
            self._reset_turn_state()
            uptime_min = (time.time() - start_time) / 60

            try:
                log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
                         session_num, uptime_min)
                async with client.aio.live.connect(model=_MODEL, config=config) as session:
                    log.info("connected — speak anytime!")
                    consecutive_errors = 0
                    self._mic.flush()
                    self._done = asyncio.Event()
                    # Reset per-session primer state so re-priming on reconnect
                    # actually happens. The state watcher will re-prime as soon
                    # as it sees vision+face-rec enabled.
                    self._gallery_version_primed = -1

                    try:
                        await self._gather_session_tasks(
                            [
                                self._send_mic_loop(session),
                                self._receive_loop(session),
                                self._send_frame_loop(session),
                                self._send_state_loop(session),
                                self._recognition_state_watcher(session),
                            ],
                            _SESSION_TIMEOUT,
                        )
                    except asyncio.TimeoutError:
                        log.warning("session timed out after %ds", _SESSION_TIMEOUT)
                    except asyncio.CancelledError:
                        # Treated as "this session is over"; the outer loop
                        # decides whether to reconnect.
                        log.warning("session cancelled")

                log.info("session #%d ended — reconnecting in 1s", session_num)
                self._speaker.stop()
                self._mic.flush()
                await asyncio.sleep(1)

            except asyncio.CancelledError:
                log.info("cancelled — stopping")
                break
            except KeyboardInterrupt:
                log.info("keyboard interrupt — stopping")
                break
            except Exception as exc:
                consecutive_errors += 1
                delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
                log.error("session error (#%d): %s — reconnecting in %ds",
                          consecutive_errors, exc, delay)
                await asyncio.sleep(delay)
                if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
                    log.warning("%d consecutive errors — recreating client",
                                consecutive_errors)
                    try:
                        client = genai.Client(api_key=self._api_key)
                        consecutive_errors = 0
                    except Exception as ce:
                        log.error("client recreation failed: %s", ce)

    @staticmethod
    async def _gather_session_tasks(awaitables, timeout) -> None:
        """Run *awaitables* together; never leave one running on exit.

        Behaves like ``wait_for(gather(...), timeout)``: returns when all
        finish, raises the first child exception, or raises
        ``asyncio.TimeoutError`` after *timeout* seconds. Unlike a bare
        ``gather``, any sibling still running when this returns or raises is
        cancelled and awaited, so a loop from a dead session cannot keep
        touching state that belongs to the next one.
        """
        tasks = [asyncio.ensure_future(aw) for aw in awaitables]
        try:
            await asyncio.wait_for(asyncio.gather(*tasks), timeout=timeout)
        finally:
            stragglers = [task for task in tasks if not task.done()]
            for task in stragglers:
                task.cancel()
            if stragglers:
                # Collect the cancellations so nothing is left pending.
                await asyncio.gather(*stragglers, return_exceptions=True)

    # ─── Gemini config ────────────────────────────────────

    def _build_config(self) -> "types.LiveConnectConfig":
        """Build the LiveConnectConfig: audio replies, voice, VAD, transcripts.

        VAD sensitivities are looked up by enum member name from the "vad"
        config section; an unknown name raises AttributeError.
        """
        return types.LiveConnectConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=self._voice,
                    ),
                ),
            ),
            realtime_input_config=types.RealtimeInputConfig(
                automatic_activity_detection=types.AutomaticActivityDetection(
                    disabled=False,
                    start_of_speech_sensitivity=getattr(
                        types.StartSensitivity,
                        _VAD.get("start_sensitivity", "START_SENSITIVITY_HIGH"),
                    ),
                    end_of_speech_sensitivity=getattr(
                        types.EndSensitivity,
                        _VAD.get("end_sensitivity", "END_SENSITIVITY_LOW"),
                    ),
                    prefix_padding_ms=_VAD.get("prefix_padding_ms", 20),
                    silence_duration_ms=_VAD.get("silence_duration_ms", 200),
                ),
            ),
            input_audio_transcription=types.AudioTranscriptionConfig(),
            output_audio_transcription=types.AudioTranscriptionConfig(),
            system_instruction=types.Content(
                parts=[types.Part(text=self._system_prompt)],
            ),
        )

    # ─── state helpers ────────────────────────────────────

    def _reset_turn_state(self) -> None:
        """Clear the speaking/stream flags and barge-in timers."""
        self._speaking = False
        self._stream_started = False
        self._barge_block_until = 0.0
        self._ai_speak_start = 0.0
        self._last_ai_audio = 0.0

    def _interrupt(self, source: str = "local") -> None:
        """Cut the current reply: stop the speaker, flush mic, close the turn."""
        self._speaking = False
        self._stream_started = False
        self._speaker.stop()
        self._mic.flush()
        self._recorder.finish_turn()
        log.info("interrupt (%s)", source)

    # ─── mic send loop ────────────────────────────────────

    async def _send_mic_loop(self, session: Any) -> None:
        """Stream gained mic chunks to Gemini until the session ends.

        Per chunk: apply _MIC_GAIN, measure energy, run barge-in detection
        while the model is speaking, replace quiet chunks with silence while
        the model is speaking (echo suppression), hand clearly-voiced user
        audio to the recorder, then send. A send failure marks the session
        done; a mic read failure is logged and ends only this loop.
        """
        threshold = _BI.get("threshold", 500)
        chunks_needed = _BI.get("loud_chunks_needed", 3)
        cooldown = _BI.get("cooldown_sec", 0.3)
        echo_suppress_below = _BI.get("echo_suppress_below", 500)
        grace = _BI.get("ai_speak_grace_sec", 0.15)

        loop = asyncio.get_running_loop()
        loud_count = 0
        last_activity = time.time()

        while not self._done.is_set() and not self._stop_flag.is_set():
            try:
                # read_chunk is run in the default executor so it cannot
                # stall the event loop.
                raw = await loop.run_in_executor(
                    None, self._mic.read_chunk, _CHUNK_BYTES,
                )
            except Exception as exc:
                log.warning("mic read failed: %s", exc)
                break

            samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
            samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
            data = samples.tobytes()
            energy = _audio_energy(data)
            now = time.time()

            # Barge-in: after AI starts speaking, sustained user energy cuts it.
            if self._speaking and now >= self._barge_block_until:
                if (now - self._ai_speak_start) >= grace:
                    if energy > threshold:
                        loud_count += 1
                    else:
                        # Quiet chunks decay the count instead of resetting it.
                        loud_count = max(0, loud_count - 1)
                    if loud_count > chunks_needed:
                        log.info("BARGE-IN (e=%d)", energy)
                        self._interrupt("barge-in")
                        loud_count = 0
                        self._barge_block_until = now + cooldown

            # Echo suppression: while AI is speaking, mask quiet frames so the
            # mic doesn't feed the model its own voice bleed.
            send_data = data
            if self._speaking and energy < echo_suppress_below:
                send_data = _SILENCE_PCM

            # Record user audio when clearly speaking and AI isn't.
            if energy > 250 and not self._speaking:
                self._recorder.capture_user(data)

            # Keep-alive watchdog: log a heartbeat after 10s without speech.
            if energy > 250:
                last_activity = now
            elif now - last_activity > 10:
                log.info("alive (no speech %.0fs, e=%d)",
                         now - last_activity, energy)
                last_activity = now

            try:
                await session.send_realtime_input(
                    audio=types.Blob(
                        data=send_data,
                        mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}",
                    ),
                )
            except asyncio.CancelledError:
                return
            except Exception as exc:
                log.warning("mic send failed: %s — ending session", exc)
                self._done.set()
                return

            await asyncio.sleep(CHUNK_SIZE / SEND_SAMPLE_RATE)

        log.info("send_mic task ended")

    # ─── receive loop ─────────────────────────────────────

    async def _receive_loop(self, session: Any) -> None:
        """Consume server messages: play audio, log transcripts, track turns.

        Marks the session done on exit for any reason (go-away, error, no
        messages for _NO_MESSAGES_TIMEOUT seconds, or stop requested) so the
        sibling loops wind down too.
        """
        loop = asyncio.get_running_loop()
        try:
            last_recv = time.time()
            while not self._done.is_set() and not self._stop_flag.is_set():
                async for response in session.receive():
                    last_recv = time.time()
                    if self._done.is_set():
                        break

                    if hasattr(response, "go_away") and response.go_away is not None:
                        log.info("server going away — will reconnect")
                        self._done.set()
                        return

                    sc = response.server_content
                    if sc is None:
                        continue

                    if sc.interrupted is True:
                        if self._speaking:
                            log.info("Gemini interrupted")
                            self._interrupt("gemini")
                        continue

                    if sc.input_transcription:
                        text = (sc.input_transcription.text or "").strip()
                        # Input transcripts that arrive while the model is
                        # speaking are not recorded.
                        if text and not self._speaking:
                            log.info("USER: %s", text)
                            self._recorder.add_user_text(text)

                    if sc.output_transcription:
                        text = (sc.output_transcription.text or "").strip()
                        if text:
                            log.info("BOT : %s", text)
                            self._recorder.add_robot_text(text)

                    if sc.model_turn:
                        # parts may be absent on a turn; treat that as empty.
                        for part in (sc.model_turn.parts or []):
                            if part.inline_data and part.inline_data.data:
                                now = time.time()
                                if not self._speaking:
                                    self._ai_speak_start = now
                                self._speaking = True
                                self._last_ai_audio = now
                                raw_audio = part.inline_data.data
                                self._recorder.capture_robot(raw_audio)
                                audio = np.frombuffer(raw_audio, dtype=np.int16)
                                if not self._stream_started:
                                    await loop.run_in_executor(
                                        None, self._speaker.begin_stream,
                                    )
                                    self._stream_started = True
                                await loop.run_in_executor(
                                    None, self._speaker.send_chunk,
                                    audio, RECEIVE_SAMPLE_RATE,
                                )

                    if sc.turn_complete:
                        if (self._speaking and self._stream_started
                                and not self._speaker.interrupted):
                            log.info("speaker %.1fs", self._speaker.total_sent_sec)
                            await loop.run_in_executor(
                                None, self._speaker.wait_finish,
                            )
                        elif self._speaking and self._speaker.interrupted:
                            log.info("speaker interrupted")
                        self._speaking = False
                        self._stream_started = False
                        self._mic.flush()
                        self._recorder.finish_turn()
                        log.info("listening")

                # Only reached once the receive iterator has ended.
                if time.time() - last_recv > _NO_MESSAGES_TIMEOUT:
                    log.warning("no messages from Gemini for %ds — session dead",
                                _NO_MESSAGES_TIMEOUT)
                    break
                await asyncio.sleep(0.1)

        except Exception as exc:
            log.warning("receive ended: %s", exc)
        finally:
            self._done.set()

    # ─── state announcers ─────────────────────────────────

    async def _inject_announcement(self, session: Any, text: str, label: str,
                                   enabled: bool, is_toggle: bool) -> None:
        """Send *text* as realtime text input and log the outcome under *label*.

        Send failures are logged and swallowed; cancellation is re-raised.
        """
        try:
            await session.send_realtime_input(text=text)
            log.info("%s injected (enabled=%s, toggle=%s)",
                     label, enabled, is_toggle)
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            log.warning("%s inject failed: %s", label, exc)

    # ─── vision-state announcer ───────────────────────────
    # Injects the camera state into the live session as text context.
    # On a live toggle Gemini is told to say so out loud ("I can see you
    # now" / "I can't see you anymore"); at session start it's silent
    # standing context so "can you see me?" is answered honestly.

    async def _announce_vision_state(self, session: Any, enabled: bool,
                                     is_toggle: bool) -> None:
        """Tell Gemini whether the camera is on (toggle or session-start form)."""
        if is_toggle and enabled:
            text = (
                "[VISION ON] Your camera was just enabled — you can now see "
                "the user through it. Briefly tell them you can see them now, "
                "in your normal Khaleeji style (for example: "
                "'هلا، الحين أشوفك زين')."
            )
        elif is_toggle and not enabled:
            text = (
                "[VISION OFF] Your camera was just disabled — you can no "
                "longer see anything. Briefly tell the user you can't see "
                "them anymore. If they later ask whether you can see them, "
                "tell them to enable the camera from the dashboard."
            )
        elif enabled:  # session start, camera already on
            text = (
                "[VISION STATUS] Your camera is ON — you can see the user "
                "through it. Do not announce this unprompted; just answer "
                "naturally if they ask what you see."
            )
        else:  # session start, camera off
            text = (
                "[VISION STATUS] Your camera is OFF — you cannot see anything "
                "right now. If the user asks whether you can see them, tell "
                "them to enable the camera from the dashboard. Do not announce "
                "this unprompted."
            )
        await self._inject_announcement(
            session, text, "vision-state", enabled, is_toggle,
        )

    # ─── face-recognition-state announcer ─────────────────
    # Same idea as _announce_vision_state, for the face-recognition toggle.
    # On a live OFF toggle it also tells Gemini to disregard the gallery —
    # so OFF takes effect immediately instead of lingering until reconnect.

    async def _announce_facerec_state(self, session: Any, enabled: bool,
                                      is_toggle: bool) -> None:
        """Tell Gemini whether face recognition is on (toggle or start form)."""
        if is_toggle and enabled:
            text = (
                "[FACE RECOGNITION ON] Face recognition was just enabled — "
                "you'll be shown the people you know in a moment. Briefly "
                "tell the user you can now recognise the people you know, in "
                "your normal Khaleeji style."
            )
        elif is_toggle and not enabled:
            text = (
                "[FACE RECOGNITION OFF] Face recognition was just disabled. "
                "Disregard the face gallery you were given earlier — stop "
                "greeting people by name and do not identify anyone. Briefly "
                "tell the user you'll no longer recognise faces."
            )
        elif enabled:  # session start, face rec already on
            text = (
                "[FACE RECOGNITION STATUS] Face recognition is ON — when you "
                "see someone you've been shown in the gallery, greet them by "
                "name. Do not announce this unprompted."
            )
        else:  # session start, face rec off
            text = (
                "[FACE RECOGNITION STATUS] Face recognition is OFF — you "
                "cannot identify people. If the user asks who someone is or "
                "whether you recognise them, tell them to enable face "
                "recognition from the dashboard. Do not announce this "
                "unprompted."
            )
        await self._inject_announcement(
            session, text, "face-rec-state", enabled, is_toggle,
        )

    # ─── recognition state watcher ────────────────────────
    # Polls data/.recognition_state.json at SANAD_RECOGNITION_POLL_S Hz and
    # mirrors vision_enabled / face_rec_enabled into in-memory flags so the
    # rest of the session can react WITHOUT a Gemini reconnect.

    async def _recognition_state_watcher(self, session: Any) -> None:
        """Mirror the recognition state file into the session while it runs.

        At start: sends the gallery primer if vision and face recognition are
        both on, then injects the silent status announcements. Afterwards,
        every _RECOG_POLL_S seconds, re-reads the file when its mtime changed,
        announces toggles, and (re)sends the primer when face recognition was
        just enabled or the gallery version differs from the primed one.
        """
        last_mtime = 0.0
        last_state = _recog_state.RecognitionState(
            vision_enabled=self._vision_enabled,
            face_rec_enabled=self._face_rec_enabled,
            gallery_version=self._gallery_version_primed,
        )
        # Best-effort initial primer if face_rec is already on at session start.
        if self._face_rec_enabled and self._vision_enabled:
            try:
                cur = _recog_state.read(_RECOG_STATE_PATH)
                await self._send_gallery_primer(session, cur.gallery_version)
            except Exception as exc:
                log.warning("initial gallery primer failed: %s", exc)

        # Tell Gemini the current camera + face-recognition state at session
        # start — silent standing context so "can you see me?" / "do you know
        # who I am?" are answered honestly even if the user never toggles.
        await self._announce_vision_state(
            session, self._vision_enabled, is_toggle=False,
        )
        await self._announce_facerec_state(
            session, self._face_rec_enabled, is_toggle=False,
        )

        while not self._done.is_set() and not self._stop_flag.is_set():
            await asyncio.sleep(_RECOG_POLL_S)
            try:
                st = _RECOG_STATE_PATH.stat()
            except Exception:
                # Missing or unreadable file: keep the current flags.
                continue
            if st.st_mtime == last_mtime:
                continue
            try:
                new_state = _recog_state.read(_RECOG_STATE_PATH)
            except Exception as exc:
                # last_mtime is left untouched so the next poll retries.
                log.warning("recognition state read failed: %s", exc)
                continue
            last_mtime = st.st_mtime

            # Vision toggle — instant. Announce it out loud so Gemini reacts
            # ("I can see you now" / "I can't see you anymore").
            if new_state.vision_enabled != last_state.vision_enabled:
                self._vision_enabled = new_state.vision_enabled
                log.info("vision toggled → %s", self._vision_enabled)
                await self._announce_vision_state(
                    session, self._vision_enabled, is_toggle=True,
                )

            # Face-rec toggle — announce it out loud. The OFF announcement
            # also tells Gemini to disregard the gallery, so OFF takes effect
            # immediately instead of lingering until the next reconnect.
            if new_state.face_rec_enabled != last_state.face_rec_enabled:
                self._face_rec_enabled = new_state.face_rec_enabled
                if self._face_rec_enabled:
                    log.info("face rec enabled — announcing + sending primer")
                else:
                    log.info("face rec disabled — telling Gemini to "
                             "disregard the gallery")
                await self._announce_facerec_state(
                    session, self._face_rec_enabled, is_toggle=True,
                )

            # Conditions for re-priming:
            #   - face_rec just turned ON
            #   - gallery version differs from the last primed version
            face_rec_just_on = (
                new_state.face_rec_enabled and not last_state.face_rec_enabled
            )
            gallery_changed = (
                new_state.gallery_version != self._gallery_version_primed
            )
            if (self._face_rec_enabled
                    and (face_rec_just_on or gallery_changed)
                    and self._vision_enabled):
                try:
                    await self._send_gallery_primer(
                        session, new_state.gallery_version,
                    )
                except Exception as exc:
                    log.warning("gallery primer failed: %s", exc)

            last_state = new_state

    # ─── camera frame send loop ───────────────────────────
    # Reads the latest JPEG from the _LATEST_FRAME cache (fed by the
    # _stdin_watcher thread, which the GeminiSubprocess supervisor pushes
    # 'frame:<b64>' lines into) and relays it to Gemini Live at
    # _VISION_SEND_HZ. Only active when self._vision_enabled. Skips frames
    # older than _VISION_STALE_MS so a stopped/unplugged camera doesn't
    # waste tokens on a frozen scene.

    async def _send_frame_loop(self, session: Any) -> None:
        """Relay the newest cached camera frame to Gemini while vision is on.

        Sleeps at least one send period per iteration (longer after send
        failures, up to 5s) and skips missing, stale, or already-sent frames.
        """
        # The send rate is floored at 0.5 Hz, so the period never exceeds 2s.
        period = 1.0 / max(0.5, _VISION_SEND_HZ)
        stale_s = _VISION_STALE_MS / 1000.0
        backoff = 0.0
        last_sent_ts = 0.0

        while not self._done.is_set() and not self._stop_flag.is_set():
            await asyncio.sleep(max(period, backoff))
            if not self._vision_enabled:
                continue
            with _LATEST_FRAME_LOCK:
                data = _LATEST_FRAME.get("bytes")
                ts = _LATEST_FRAME.get("ts", 0.0)
            if not data:
                continue
            # Stale — supervisor stopped pushing (camera off / unplugged).
            if (time.time() - ts) > stale_s:
                continue
            # De-dup — don't re-send a frame we already relayed.
            if ts == last_sent_ts:
                continue
            try:
                await session.send_realtime_input(
                    video=types.Blob(data=data, mime_type="image/jpeg"),
                )
                last_sent_ts = ts
                backoff = 0.0
            except asyncio.CancelledError:
                return
            except Exception as exc:
                log.warning("frame send failed: %s", exc)
                backoff = min(backoff * 2 + 0.5, 5.0)

    # ─── motion-state inject loop ─────────────────────────
    # Drains _STATE_PENDING (fed by the _stdin_watcher from 'state:' lines
    # the supervisor pushes when the arm starts/finishes/errors a motion)
    # and injects each as silent text context into the live session, so
    # Gemini can answer "what are you doing?" honestly. Per persona, Gemini
    # reads these for context but does not narrate them unprompted.

    async def _send_state_loop(self, session: Any) -> None:
        """Every 0.1s, drain queued state messages and inject each as text."""
        while not self._done.is_set() and not self._stop_flag.is_set():
            await asyncio.sleep(0.1)
            # Snapshot and clear under the lock; send outside it so the
            # stdin thread is not held up by network I/O.
            with _STATE_LOCK:
                if not _STATE_PENDING:
                    continue
                pending = list(_STATE_PENDING)
                _STATE_PENDING.clear()
            for msg in pending:
                try:
                    await session.send_realtime_input(text=msg)
                    log.info("STATE injected: %s", msg)
                except asyncio.CancelledError:
                    return
                except Exception as exc:
                    # Some SDK versions may not accept text on
                    # send_realtime_input — log once-ish and keep going;
                    # motion still works, only this context channel is lost.
                    log.warning("state inject failed: %s", exc)

    # ─── face gallery primer ──────────────────────────────
    # Builds one multimodal turn carrying the entire face gallery + a Khaleeji
    # greeting instruction, and sends it via send_client_content. Gemini keeps
    # this in session context until reconnect. Re-sent on gallery_version bumps.

    async def _send_gallery_primer(self, session: Any, version: int) -> None:
        """Send the face gallery as one user turn and record *version* as primed.

        Returns without priming if the gallery module cannot be imported, the
        gallery fails to load, or the send fails. An empty gallery sends
        nothing but still records *version* as primed.
        """
        try:
            from Project.Sanad.vision.face_gallery import FaceGallery
        except Exception as exc:
            log.info("face gallery module unavailable: %s", exc)
            return

        gallery = FaceGallery(_FACES_DIR)
        try:
            entries = gallery.load_for_primer(
                max_samples_per_face=_FACES_MAX_SAMPLES,
                resize_long_side=_FACES_PRIMER_RESIZE,
            )
        except Exception as exc:
            log.warning("face gallery load failed: %s", exc)
            return

        if not entries:
            log.info("face gallery empty — primer skipped (v.%d)", version)
            self._gallery_version_primed = version
            return

        parts: list[dict[str, Any]] = [{
            "text": (
                "GALLERY PRIMER (do not reply to this turn). "
                "Below are people you know. When the live camera shows one of "
                "them, greet them warmly by name in UAE Khaleeji dialect "
                "(for example: 'هلا والله يا كسام، شحالك؟'), and you may use "
                "the notes about them to make the conversation personal. "
                "For faces NOT in this gallery, welcome them as a guest "
                "without inventing a name. Greet each person only once per "
                "minute to avoid repetition."
            ),
        }]
        # One label part per person, followed by that person's JPEG samples.
        for entry, jpegs in entries:
            label = (
                f"This person is named {entry.name}."
                if entry.name
                else "This person's name is unknown — greet as guest."
            )
            if entry.description:
                label += f" Notes about them: {entry.description}"
            parts.append({"text": f"\n— {label}"})
            for jpeg in jpegs:
                parts.append({
                    "inline_data": {"mime_type": "image/jpeg", "data": jpeg},
                })

        try:
            await session.send_client_content(
                turns=[{"role": "user", "parts": parts}],
                turn_complete=True,
            )
        except Exception as exc:
            log.warning("primer send failed: %s", exc)
            return
        self._gallery_version_primed = version
        log.info("face gallery primed: %d person(s), v.%d", len(entries), version)
|