Sanadv3/gemini/script.py

792 lines
34 KiB
Python

"""Gemini brain — live conversation loop using the google-genai SDK.
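
Implements the VoiceBrain contract documented in `voice/model_script.py`:

    __init__(audio_io, recorder, voice_name, system_prompt)
    async run()
    stop()

Owns everything Gemini-specific: the `genai.Client`, `LiveConnectConfig`,
the session connect/receive loop, VAD-based barge-in, echo suppression,
and reconnect backoff. Hardware I/O is delegated to `audio_io` and per-turn
WAV capture to `recorder` — both are model-agnostic.

Env overrides:
    SANAD_GEMINI_MODEL            — Gemini Live model id (without "models/" prefix)
    SANAD_RECOGNITION_STATE_PATH  — path of the recognition state JSON file
    SANAD_RECOGNITION_POLL_S      — poll period for that file, in seconds
    SANAD_VISION_ENABLE           — "1" forces the camera relay on at boot
    SANAD_VISION_SEND_HZ          — camera frames relayed per second
    SANAD_VISION_STALE_MS         — frames older than this are not relayed
    SANAD_FACE_RECOGNITION_ENABLE — "1" forces face recognition on at boot
    SANAD_FACES_DIR               — directory holding the face gallery
    SANAD_FACES_MAX_SAMPLES       — max images per person in the primer
    SANAD_FACES_PRIMER_RESIZE     — long-side resize applied to primer images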
"""
from __future__ import annotations
import array
import asyncio
import base64
import json
import os
import sys
import threading
import time
from pathlib import Path
from typing import Any, Optional
import numpy as np
from google import genai
from google.genai import types
from Project.Sanad.config import (
BASE_DIR,
CHUNK_SIZE,
GEMINI_API_KEY,
GEMINI_VOICE,
RECEIVE_SAMPLE_RATE,
SEND_SAMPLE_RATE,
)
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
from Project.Sanad.vision import recognition_state as _recog_state
log = get_logger("gemini_brain")

# Sections of the "voice" configuration used by this module.
_SV = _cfg_section("voice", "sanad_voice")
_VAD = _cfg_section("voice", "vad")
_BI = _cfg_section("voice", "barge_in")

# Gemini Live model id; the environment wins over the built-in default.
_MODEL = os.getenv(
    "SANAD_GEMINI_MODEL",
    "gemini-2.5-flash-native-audio-preview-12-2025",
)

# Session / reconnect tunables, each with a fallback when the key is absent.
_MIC_GAIN = _SV.get("mic_gain", 1.0)
_SESSION_TIMEOUT = _SV.get("session_timeout_sec", 660)
_MAX_RECONNECT_DELAY = _SV.get("max_reconnect_delay_sec", 30)
_MAX_CONSECUTIVE_ERRORS = _SV.get("max_consecutive_errors", 10)
_NO_MESSAGES_TIMEOUT = _SV.get("no_messages_timeout_sec", 30)

# One mic chunk in bytes (two bytes per sample) and a same-sized block of
# zeros that is sent in place of real audio during echo suppression.
_CHUNK_BYTES = 2 * CHUNK_SIZE
_SILENCE_PCM = b"\x00" * _CHUNK_BYTES

# ── Recognition (camera + face gallery) tunables ──
_RECOG_STATE_PATH = Path(os.getenv(
    "SANAD_RECOGNITION_STATE_PATH",
    str(BASE_DIR / "data" / ".recognition_state.json"),
))
_RECOG_POLL_S = float(os.getenv("SANAD_RECOGNITION_POLL_S", "1.0"))
_VISION_SEND_HZ = float(os.getenv("SANAD_VISION_SEND_HZ", "2"))
_VISION_STALE_MS = int(os.getenv("SANAD_VISION_STALE_MS", "1500"))
_FACES_DIR = Path(os.getenv(
    "SANAD_FACES_DIR",
    str(BASE_DIR / "data" / "faces"),
))
_FACES_MAX_SAMPLES = int(os.getenv("SANAD_FACES_MAX_SAMPLES", "3"))
_FACES_PRIMER_RESIZE = int(os.getenv("SANAD_FACES_PRIMER_RESIZE", "256"))
# ── stdin push channel (Marcus pattern) ──────────────────────
# The GeminiSubprocess supervisor writes two line types to this process's
# stdin:
#   "frame:<base64-jpeg>\n" — a camera frame to relay to Gemini Live
#   "state:<json>\n"        — a motion-state update to inject as text
# A daemon thread parses them into the caches below; the asyncio tasks
# _send_frame_loop / _send_state_loop drain those caches.

# Latest decoded camera frame plus the time.time() at which it was stored.
# Written by _stdin_watcher and read by _send_frame_loop, always under the lock.
_LATEST_FRAME_LOCK = threading.Lock()
_LATEST_FRAME: dict = {"bytes": None, "ts": 0.0}

# Queue of ready-to-send state messages. Appended to by _stdin_watcher and
# drained by _send_state_loop, always under the lock.
_STATE_LOCK = threading.Lock()
_STATE_PENDING: list[str] = []

# Maps the lower-cased "event" field of a state payload to the tag that
# prefixes the injected text; events not listed here are ignored.
_STATE_TAGS = {
    "start": "[STATE-START]",
    "complete": "[STATE-DONE]",
    "interrupted": "[STATE-INTERRUPTED]",
    "error": "[STATE-ERROR]",
    "paused": "[STATE-PAUSED]",
    "resumed": "[STATE-RESUMED]",
}
def _format_state_message(payload: Any, tags: dict) -> Optional[str]:
    """Build the text injected into Gemini for one decoded 'state:' payload.

    Args:
        payload: the JSON-decoded value of a 'state:' line.
        tags: mapping of lower-cased event name to its message tag.

    Returns:
        The formatted message, or None when the payload is unusable: not a
        JSON object, a non-string event/cmd, an unknown event, or an empty cmd.
    """
    if not isinstance(payload, dict):
        return None
    event = payload.get("event") or ""
    cmd = payload.get("cmd") or ""
    if not isinstance(event, str) or not isinstance(cmd, str):
        return None
    event = event.strip().lower()
    cmd = cmd.strip()
    tag = tags.get(event)
    if not tag or not cmd:
        return None
    msg = f"{tag} {cmd}"
    elapsed = payload.get("elapsed_sec")
    # bool is an int subclass; a JSON true/false is not a duration.
    if isinstance(elapsed, (int, float)) and not isinstance(elapsed, bool):
        msg += f" ({float(elapsed):.1f}s)"
    reason = payload.get("reason")
    if reason and event == "error":
        # Separator keeps the reason from running into the preceding text.
        msg += f" — {reason}"
    return msg


def _stdin_watcher() -> None:
    """Daemon thread — parse 'frame:' / 'state:' lines off stdin.

    'frame:' lines are base64-decoded into _LATEST_FRAME; 'state:' lines are
    JSON-decoded, formatted and appended to _STATE_PENDING. Best-effort: any
    malformed line is skipped without stopping the thread. The loop ends when
    iteration over stdin ends (the parent closed our stdin) or stdin itself
    fails, in which case the reason is logged.
    """
    try:
        for line in sys.stdin:
            line = line.rstrip("\n")
            if not line:
                continue
            if line.startswith("frame:"):
                try:
                    data = base64.b64decode(line[len("frame:"):])
                except Exception:
                    continue
                if data:
                    with _LATEST_FRAME_LOCK:
                        _LATEST_FRAME["bytes"] = data
                        _LATEST_FRAME["ts"] = time.time()
            elif line.startswith("state:"):
                try:
                    payload = json.loads(line[len("state:"):])
                except Exception:
                    continue
                # Valid JSON can still be the wrong shape (a list, a number,
                # non-string fields); that must skip the line, not end the thread.
                msg = _format_state_message(payload, _STATE_TAGS)
                if msg is None:
                    continue
                with _STATE_LOCK:
                    _STATE_PENDING.append(msg)
    except Exception as exc:
        log.warning("stdin watcher stopped: %s", exc)
# Start the watcher at import time — it blocks harmlessly on sys.stdin
# until the supervisor sends something. Daemon so it never blocks exit.
# NOTE: this is an import-time side effect; importing this module always
# spawns the "stdin-watcher" thread.
threading.Thread(target=_stdin_watcher, daemon=True, name="stdin-watcher").start()
def _audio_energy(pcm: bytes) -> int:
    """Return the mean absolute amplitude of native-endian 16-bit PCM.

    An empty buffer, or one that cannot be decoded as 16-bit samples (for
    example an odd number of bytes), yields 0.
    """
    try:
        pcm16 = array.array("h", pcm)
        if not pcm16:
            return 0
        total = 0
        for sample in pcm16:
            total += abs(sample)
        return total // len(pcm16)
    except Exception:
        return 0
class GeminiBrain:
    """Gemini Live conversation brain — reconnect-safe."""

    def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
                 system_prompt: str = ""):
        """Store collaborators and initialise per-session bookkeeping.

        Args:
            audio_io: object exposing `.mic` and `.speaker`.
            recorder: per-turn capture sink used by the send/receive loops.
            voice_name: prebuilt voice to use; falls back to GEMINI_VOICE
                when empty or None.
            system_prompt: text placed in the session's system instruction.
        """
        # Collaborators and static configuration.
        self._audio = audio_io
        self._mic = audio_io.mic
        self._speaker = audio_io.speaker
        self._recorder = recorder
        self._voice = voice_name if voice_name else GEMINI_VOICE
        self._system_prompt = system_prompt
        self._api_key = GEMINI_API_KEY
        self._stop_flag = asyncio.Event()

        # Per-session state (reset in the outer reconnect loop).
        self._speaking = False
        self._stream_started = False
        self._barge_block_until = 0.0
        self._ai_speak_start = 0.0
        self._last_ai_audio = 0.0
        self._done: Optional[asyncio.Event] = None

        # Recognition flags — kept in sync with the state file by
        # _recognition_state_watcher. At boot a flag is on when either the
        # state file or the matching SANAD_* env var (value "1") says so.
        boot_state = _recog_state.read(_RECOG_STATE_PATH)
        vision_forced = os.environ.get("SANAD_VISION_ENABLE", "0") == "1"
        face_rec_forced = (
            os.environ.get("SANAD_FACE_RECOGNITION_ENABLE", "0") == "1"
        )
        self._vision_enabled = bool(boot_state.vision_enabled) or vision_forced
        self._face_rec_enabled = (
            bool(boot_state.face_rec_enabled) or face_rec_forced
        )
        # -1 means "no primer sent yet"; updated after a successful primer.
        self._gallery_version_primed = -1
def stop(self) -> None:
    """Raise the stop flag so the run loop exits at its next check.

    Never raises: any failure while setting the flag is ignored.
    """
    try:
        self._stop_flag.set()
    except Exception:
        return None
    return None
# ─── public entry point ───────────────────────────────
async def run(self) -> None:
    """Connect to Gemini Live and keep a session running until stopped.

    Each pass of the outer loop opens one live session and runs the five
    worker loops (mic send, receive, frame send, state inject, recognition
    watcher) concurrently, bounded by _SESSION_TIMEOUT. When the session
    ends the loop reconnects after one second. Connection or session errors
    back off exponentially, capped at _MAX_RECONNECT_DELAY; after
    _MAX_CONSECUTIVE_ERRORS in a row the client is recreated.

    Returns when stop() has been called, or when the task running this
    coroutine is cancelled or interrupted from the keyboard.
    """
    client = genai.Client(api_key=self._api_key)
    config = self._build_config()
    session_num = 0
    start_time = time.time()
    consecutive_errors = 0
    while not self._stop_flag.is_set():
        session_num += 1
        self._reset_turn_state()
        uptime_min = (time.time() - start_time) / 60
        try:
            log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
                     session_num, uptime_min)
            async with client.aio.live.connect(model=_MODEL, config=config) as session:
                log.info("connected — speak anytime!")
                consecutive_errors = 0
                self._mic.flush()
                self._done = asyncio.Event()
                # Reset per-session primer state so re-priming on reconnect
                # actually happens. The state watcher will re-prime as soon
                # as it sees vision+face-rec enabled.
                self._gallery_version_primed = -1
                try:
                    await asyncio.wait_for(
                        asyncio.gather(
                            self._send_mic_loop(session),
                            self._receive_loop(session),
                            self._send_frame_loop(session),
                            self._send_state_loop(session),
                            self._recognition_state_watcher(session),
                        ),
                        timeout=_SESSION_TIMEOUT,
                    )
                except asyncio.TimeoutError:
                    log.warning("session timed out after %ds", _SESSION_TIMEOUT)
                finally:
                    # gather() does not cancel siblings when one worker
                    # raises; flag the session as over so none of them keeps
                    # using a session that is about to close.
                    # CancelledError is deliberately NOT caught here: it
                    # reaches the outer handler, which stops the run loop
                    # instead of reconnecting.
                    self._done.set()
            self._speaker.stop()
            self._mic.flush()
            if self._stop_flag.is_set():
                # stop() was requested — leave without the reconnect pause.
                log.info("session #%d ended — stop requested", session_num)
                break
            log.info("session #%d ended — reconnecting in 1s", session_num)
            await asyncio.sleep(1)
        except asyncio.CancelledError:
            log.info("cancelled — stopping")
            break
        except KeyboardInterrupt:
            log.info("keyboard interrupt — stopping")
            break
        except Exception as exc:
            consecutive_errors += 1
            delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
            log.error("session error (#%d): %s — reconnecting in %ds",
                      consecutive_errors, exc, delay)
            await asyncio.sleep(delay)
            if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
                log.warning("%d consecutive errors — recreating client",
                            consecutive_errors)
                try:
                    client = genai.Client(api_key=self._api_key)
                    consecutive_errors = 0
                except Exception as ce:
                    log.error("client recreation failed: %s", ce)
# ─── Gemini config ────────────────────────────────────
def _build_config(self) -> types.LiveConnectConfig:
    """Assemble the LiveConnectConfig used for every session.

    Audio-only responses with the configured prebuilt voice, server-side
    automatic activity detection tuned from the "vad" config section,
    transcription of both input and output audio, and the system prompt
    as the system instruction.
    """
    speech = types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                voice_name=self._voice,
            ),
        ),
    )

    # Sensitivities are configured by enum member name and resolved here.
    start_name = _VAD.get("start_sensitivity", "START_SENSITIVITY_HIGH")
    end_name = _VAD.get("end_sensitivity", "END_SENSITIVITY_LOW")
    activity_detection = types.AutomaticActivityDetection(
        disabled=False,
        start_of_speech_sensitivity=getattr(types.StartSensitivity, start_name),
        end_of_speech_sensitivity=getattr(types.EndSensitivity, end_name),
        prefix_padding_ms=_VAD.get("prefix_padding_ms", 20),
        silence_duration_ms=_VAD.get("silence_duration_ms", 200),
    )

    instruction = types.Content(
        parts=[types.Part(text=self._system_prompt)],
    )

    return types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        speech_config=speech,
        realtime_input_config=types.RealtimeInputConfig(
            automatic_activity_detection=activity_detection,
        ),
        input_audio_transcription=types.AudioTranscriptionConfig(),
        output_audio_transcription=types.AudioTranscriptionConfig(),
        system_instruction=instruction,
    )
# ─── state helpers ────────────────────────────────────
def _reset_turn_state(self) -> None:
    """Put the speaking / barge-in bookkeeping back to its idle values."""
    self._speaking = self._stream_started = False
    self._barge_block_until = self._ai_speak_start = self._last_ai_audio = 0.0
def _interrupt(self, source: str = "local") -> None:
    """Cut the current AI turn short.

    Clears the speaking flags, stops the speaker, flushes the mic, closes
    the recorder's turn and logs which side (`source`) caused it.
    """
    self._speaking = self._stream_started = False
    for halt in (self._speaker.stop, self._mic.flush, self._recorder.finish_turn):
        halt()
    log.info("interrupt (%s)", source)
# ─── mic send loop ────────────────────────────────────
async def _send_mic_loop(self, session: Any) -> None:
    """Stream mic audio to Gemini and run local barge-in detection.

    Per chunk: read from the mic in the default executor, apply _MIC_GAIN
    with clipping to the int16 range, measure energy, run barge-in and echo
    suppression, feed the recorder, and send the chunk to the session.

    The loop ends when the session is flagged done or stop is requested.
    A mic read failure or a send failure ends the whole session (sets
    `_done`) so the outer loop reconnects, instead of leaving a session
    running without microphone input.

    Args:
        session: the connected live session to send audio to.
    """
    threshold = _BI.get("threshold", 500)
    chunks_needed = _BI.get("loud_chunks_needed", 3)
    cooldown = _BI.get("cooldown_sec", 0.3)
    echo_suppress_below = _BI.get("echo_suppress_below", 500)
    grace = _BI.get("ai_speak_grace_sec", 0.15)
    speech_floor = 250   # energy above this counts as user activity
    idle_log_sec = 10    # period of the "alive" log line while silent
    mime_type = f"audio/pcm;rate={SEND_SAMPLE_RATE}"  # loop-invariant
    loop = asyncio.get_running_loop()
    loud_count = 0
    last_activity = time.time()
    while not self._done.is_set() and not self._stop_flag.is_set():
        try:
            raw = await loop.run_in_executor(
                None, self._mic.read_chunk, _CHUNK_BYTES,
            )
        except Exception as exc:
            log.warning("mic read failed: %s — ending session", exc)
            self._done.set()
            break
        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
        samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
        data = samples.tobytes()
        energy = _audio_energy(data)
        now = time.time()

        if not self._speaking:
            # Loud chunks only count within one AI turn; do not let a
            # partial count from the previous turn shorten the next one.
            loud_count = 0
        # Barge-in: after AI starts speaking, sustained user energy cuts it.
        elif now >= self._barge_block_until:
            if (now - self._ai_speak_start) >= grace:
                if energy > threshold:
                    loud_count += 1
                else:
                    loud_count = max(0, loud_count - 1)
                # NOTE(review): strict '>' means chunks_needed + 1 loud
                # chunks are required — confirm this is intended.
                if loud_count > chunks_needed:
                    log.info("BARGE-IN (e=%d)", energy)
                    self._interrupt("barge-in")
                    loud_count = 0
                    self._barge_block_until = now + cooldown

        # Echo suppression: while AI is speaking, mask quiet frames so the
        # mic doesn't feed the model its own voice bleed.
        send_data = data
        if self._speaking and energy < echo_suppress_below:
            send_data = _SILENCE_PCM

        # Record user audio when clearly speaking and AI isn't.
        if energy > speech_floor and not self._speaking:
            self._recorder.capture_user(data)

        # Keep-alive watchdog
        if energy > speech_floor:
            last_activity = now
        elif now - last_activity > idle_log_sec:
            log.info("alive (no speech %.0fs, e=%d)",
                     now - last_activity, energy)
            last_activity = now

        try:
            await session.send_realtime_input(
                audio=types.Blob(data=send_data, mime_type=mime_type),
            )
        except asyncio.CancelledError:
            return
        except Exception as exc:
            log.warning("mic send failed: %s — ending session", exc)
            self._done.set()
            return
        await asyncio.sleep(CHUNK_SIZE / SEND_SAMPLE_RATE)
    log.info("send_mic task ended")
# ─── receive loop ─────────────────────────────────────
async def _receive_loop(self, session: Any) -> None:
    """Consume server messages: transcripts, AI audio, interruptions.

    Runs until the session is flagged done or stop is requested. Whatever
    ends it — go-away notice, watchdog, exception — `_done` is set on the
    way out so the sibling loops end too.
    """
    loop = asyncio.get_event_loop()
    try:
        last_recv = time.time()
        while not self._done.is_set() and not self._stop_flag.is_set():
            async for response in session.receive():
                last_recv = time.time()
                if self._done.is_set():
                    break
                # Server announced it is going away: end this session so the
                # outer loop reconnects.
                if hasattr(response, "go_away") and response.go_away is not None:
                    log.info("server going away — will reconnect")
                    self._done.set()
                    return
                sc = response.server_content
                if sc is None:
                    continue
                # Server-side interruption: stop playback if we were speaking;
                # the rest of this message is not processed.
                if sc.interrupted is True:
                    if self._speaking:
                        log.info("Gemini interrupted")
                        self._interrupt("gemini")
                    continue
                # User transcript — ignored while the AI is speaking.
                if sc.input_transcription:
                    text = (sc.input_transcription.text or "").strip()
                    if text and not self._speaking:
                        log.info("USER: %s", text)
                        self._recorder.add_user_text(text)
                # AI transcript.
                if sc.output_transcription:
                    text = (sc.output_transcription.text or "").strip()
                    if text:
                        log.info("BOT : %s", text)
                        self._recorder.add_robot_text(text)
                # AI audio: mark speaking, record it, and play it through the
                # speaker in the executor.
                if sc.model_turn:
                    for part in sc.model_turn.parts:
                        if part.inline_data and part.inline_data.data:
                            now = time.time()
                            if not self._speaking:
                                # First audio of this turn — start of the
                                # barge-in grace window used by the mic loop.
                                self._ai_speak_start = now
                            self._speaking = True
                            self._last_ai_audio = now
                            raw_audio = part.inline_data.data
                            self._recorder.capture_robot(raw_audio)
                            audio = np.frombuffer(raw_audio, dtype=np.int16)
                            if not self._stream_started:
                                await loop.run_in_executor(
                                    None, self._speaker.begin_stream,
                                )
                                self._stream_started = True
                            await loop.run_in_executor(
                                None, self._speaker.send_chunk,
                                audio, RECEIVE_SAMPLE_RATE,
                            )
                # End of the AI turn: wait for playback unless it was
                # interrupted, then go back to listening.
                if sc.turn_complete:
                    if (self._speaking and self._stream_started
                            and not self._speaker.interrupted):
                        log.info("speaker %.1fs", self._speaker.total_sent_sec)
                        await loop.run_in_executor(
                            None, self._speaker.wait_finish,
                        )
                    elif self._speaking and self._speaker.interrupted:
                        log.info("speaker interrupted")
                    self._speaking = False
                    self._stream_started = False
                    self._mic.flush()
                    self._recorder.finish_turn()
                    log.info("listening")
            # NOTE(review): this check runs only after the receive() iterator
            # above has finished, so a stream that stalls without ending is
            # presumably not caught here — confirm.
            if time.time() - last_recv > _NO_MESSAGES_TIMEOUT:
                log.warning("no messages from Gemini for %ds — session dead",
                            _NO_MESSAGES_TIMEOUT)
                break
            await asyncio.sleep(0.1)
    except Exception as exc:
        log.warning("receive ended: %s", exc)
    finally:
        self._done.set()
# ─── vision-state announcer ───────────────────────────
# Injects the camera state into the live session as text context.
# On a live toggle Gemini is told to say so out loud ("I can see you
# now" / "I can't see you anymore"); at session start it's silent
# standing context so "can you see me?" is answered honestly.
async def _announce_vision_state(self, session: Any, enabled: bool,
                                 is_toggle: bool) -> None:
    """Inject the camera state into the live session as text.

    The wording depends on whether this is a live toggle (spoken
    acknowledgement requested) or session-start context (silent), and on
    whether the camera is on. Send failures are logged and swallowed;
    cancellation propagates.
    """
    # Keyed by (is_toggle, enabled).
    prompts = {
        (True, True): (
            "[VISION ON] Your camera was just enabled — you can now see "
            "the user through it. Briefly tell them you can see them now, "
            "in your normal Khaleeji style (for example: "
            "'هلا، الحين أشوفك زين')."
        ),
        (True, False): (
            "[VISION OFF] Your camera was just disabled — you can no "
            "longer see anything. Briefly tell the user you can't see "
            "them anymore. If they later ask whether you can see them, "
            "tell them to enable the camera from the dashboard."
        ),
        # Session start, camera already on.
        (False, True): (
            "[VISION STATUS] Your camera is ON — you can see the user "
            "through it. Do not announce this unprompted; just answer "
            "naturally if they ask what you see."
        ),
        # Session start, camera off.
        (False, False): (
            "[VISION STATUS] Your camera is OFF — you cannot see anything "
            "right now. If the user asks whether you can see them, tell "
            "them to enable the camera from the dashboard. Do not announce "
            "this unprompted."
        ),
    }
    text = prompts[(bool(is_toggle), bool(enabled))]
    try:
        await session.send_realtime_input(text=text)
        log.info("vision-state injected (enabled=%s, toggle=%s)",
                 enabled, is_toggle)
    except asyncio.CancelledError:
        raise
    except Exception as exc:
        log.warning("vision-state inject failed: %s", exc)
# ─── face-recognition-state announcer ─────────────────
# Same idea as _announce_vision_state, for the face-recognition toggle.
# On a live OFF toggle it also tells Gemini to disregard the gallery —
# so OFF takes effect immediately instead of lingering until reconnect.
async def _announce_facerec_state(self, session: Any, enabled: bool,
                                  is_toggle: bool) -> None:
    """Inject the face-recognition state into the live session as text.

    A live toggle asks Gemini to acknowledge it out loud (and, when turned
    off, to disregard the gallery); at session start the text is silent
    context. Send failures are logged and swallowed; cancellation
    propagates.
    """
    # Keyed by (is_toggle, enabled).
    prompts = {
        (True, True): (
            "[FACE RECOGNITION ON] Face recognition was just enabled — "
            "you'll be shown the people you know in a moment. Briefly "
            "tell the user you can now recognise the people you know, in "
            "your normal Khaleeji style."
        ),
        (True, False): (
            "[FACE RECOGNITION OFF] Face recognition was just disabled. "
            "Disregard the face gallery you were given earlier — stop "
            "greeting people by name and do not identify anyone. Briefly "
            "tell the user you'll no longer recognise faces."
        ),
        # Session start, face rec already on.
        (False, True): (
            "[FACE RECOGNITION STATUS] Face recognition is ON — when you "
            "see someone you've been shown in the gallery, greet them by "
            "name. Do not announce this unprompted."
        ),
        # Session start, face rec off.
        (False, False): (
            "[FACE RECOGNITION STATUS] Face recognition is OFF — you "
            "cannot identify people. If the user asks who someone is or "
            "whether you recognise them, tell them to enable face "
            "recognition from the dashboard. Do not announce this "
            "unprompted."
        ),
    }
    text = prompts[(bool(is_toggle), bool(enabled))]
    try:
        await session.send_realtime_input(text=text)
        log.info("face-rec-state injected (enabled=%s, toggle=%s)",
                 enabled, is_toggle)
    except asyncio.CancelledError:
        raise
    except Exception as exc:
        log.warning("face-rec-state inject failed: %s", exc)
# ─── recognition state watcher ────────────────────────
# Polls data/.recognition_state.json at SANAD_RECOGNITION_POLL_S Hz and
# mirrors vision_enabled / face_rec_enabled into in-memory flags so the
# rest of the session can react WITHOUT a Gemini reconnect.
async def _recognition_state_watcher(self, session: Any) -> None:
    """Mirror the recognition state file into the live session.

    At session start: send the gallery primer when both vision and face
    recognition are on, then inject the current camera and face-recognition
    state as silent context. Afterwards poll the state file every
    _RECOG_POLL_S seconds; when its mtime changes, update the in-memory
    flags, announce toggles, and (re)send the primer when needed.
    """
    seen_mtime = 0.0
    previous = _recog_state.RecognitionState(
        vision_enabled=self._vision_enabled,
        face_rec_enabled=self._face_rec_enabled,
        gallery_version=self._gallery_version_primed,
    )

    # Best-effort initial primer if face rec is already on at session start.
    if self._face_rec_enabled and self._vision_enabled:
        try:
            boot = _recog_state.read(_RECOG_STATE_PATH)
            await self._send_gallery_primer(session, boot.gallery_version)
        except Exception as exc:
            log.warning("initial gallery primer failed: %s", exc)

    # Standing context so "can you see me?" / "do you know who I am?" are
    # answered honestly even if the user never toggles anything.
    await self._announce_vision_state(
        session, self._vision_enabled, is_toggle=False,
    )
    await self._announce_facerec_state(
        session, self._face_rec_enabled, is_toggle=False,
    )

    while not (self._done.is_set() or self._stop_flag.is_set()):
        await asyncio.sleep(_RECOG_POLL_S)
        try:
            file_info = _RECOG_STATE_PATH.stat()
        except Exception:
            # Missing or unreadable state file — try again next poll.
            continue
        if file_info.st_mtime == seen_mtime:
            continue
        seen_mtime = file_info.st_mtime
        current = _recog_state.read(_RECOG_STATE_PATH)

        # Vision toggle — instant, and announced out loud.
        if current.vision_enabled != previous.vision_enabled:
            self._vision_enabled = current.vision_enabled
            log.info("vision toggled → %s", self._vision_enabled)
            await self._announce_vision_state(
                session, self._vision_enabled, is_toggle=True,
            )

        # Face-rec toggle — announced out loud; the OFF announcement also
        # tells Gemini to disregard the gallery right away.
        if current.face_rec_enabled != previous.face_rec_enabled:
            self._face_rec_enabled = current.face_rec_enabled
            if self._face_rec_enabled:
                log.info("face rec enabled — announcing + sending primer")
            else:
                log.info("face rec disabled — telling Gemini to "
                         "disregard the gallery")
            await self._announce_facerec_state(
                session, self._face_rec_enabled, is_toggle=True,
            )

        # Re-prime when face rec has just turned on, or when the gallery
        # version differs from the one last primed — and only while both
        # face rec and vision are on.
        just_enabled = (
            current.face_rec_enabled and not previous.face_rec_enabled
        )
        version_differs = (
            current.gallery_version != self._gallery_version_primed
        )
        wants_primer = just_enabled or version_differs
        if self._face_rec_enabled and wants_primer and self._vision_enabled:
            try:
                await self._send_gallery_primer(
                    session, current.gallery_version,
                )
            except Exception as exc:
                log.warning("gallery primer failed: %s", exc)

        previous = current
# ─── camera frame send loop ───────────────────────────
# Reads the latest JPEG from the _LATEST_FRAME cache (fed by the
# _stdin_watcher thread, which the GeminiSubprocess supervisor pushes
# 'frame:<b64>' lines into) and relays it to Gemini Live at
# _VISION_SEND_HZ. Only active when self._vision_enabled. Skips frames
# older than _VISION_STALE_MS so a stopped/unplugged camera doesn't
# waste tokens on a frozen scene.
async def _send_frame_loop(self, session: Any) -> None:
    """Relay the latest cached camera frame to Gemini at a fixed rate.

    Nothing is sent while vision is disabled, when no frame is cached,
    when the cached frame is older than _VISION_STALE_MS, or when it was
    already relayed. A failed send lengthens the wait before the next
    attempt (capped at 5 seconds); a successful one resets it.
    """
    interval = 1.0 / max(0.5, _VISION_SEND_HZ)
    max_age = _VISION_STALE_MS / 1000.0
    retry_delay = 0.0
    relayed_ts = 0.0
    while not (self._done.is_set() or self._stop_flag.is_set()):
        await asyncio.sleep(max(interval, retry_delay))
        if not self._vision_enabled:
            continue
        with _LATEST_FRAME_LOCK:
            jpeg = _LATEST_FRAME.get("bytes")
            stamp = _LATEST_FRAME.get("ts", 0.0)
        # Skip when there is no frame, the frame is stale (supervisor
        # stopped pushing), or it is the one we relayed last time.
        if not jpeg or (time.time() - stamp) > max_age or stamp == relayed_ts:
            continue
        try:
            await session.send_realtime_input(
                video=types.Blob(data=jpeg, mime_type="image/jpeg"),
            )
        except asyncio.CancelledError:
            return
        except Exception as exc:
            log.warning("frame send failed: %s", exc)
            retry_delay = min(retry_delay * 2 + 0.5, 5.0)
        else:
            relayed_ts = stamp
            retry_delay = 0.0
# ─── motion-state inject loop ─────────────────────────
# Drains _STATE_PENDING (fed by the _stdin_watcher from 'state:' lines
# the supervisor pushes when the arm starts/finishes/errors a motion)
# and injects each as silent text context into the live session, so
# Gemini can answer "what are you doing?" honestly. Per persona, Gemini
# reads these for context but does not narrate them unprompted.
async def _send_state_loop(self, session: Any) -> None:
    """Drain queued motion-state messages into the live session as text.

    Every 0.1 s the pending queue is emptied under its lock and each
    message is sent in order. A message whose send fails is logged and
    dropped; cancellation ends the loop.
    """
    while not (self._done.is_set() or self._stop_flag.is_set()):
        await asyncio.sleep(0.1)
        # Take the whole batch under the lock, send outside it.
        with _STATE_LOCK:
            batch = _STATE_PENDING[:]
            _STATE_PENDING.clear()
        for message in batch:
            try:
                await session.send_realtime_input(text=message)
                log.info("STATE injected: %s", message)
            except asyncio.CancelledError:
                return
            except Exception as exc:
                # Some SDK versions may not accept text on
                # send_realtime_input — keep going; motion still works,
                # only this context channel is lost.
                log.warning("state inject failed: %s", exc)
# ─── face gallery primer ──────────────────────────────
# Builds one multimodal turn carrying the entire face gallery + a Khaleeji
# greeting instruction, and sends it via send_client_content. Gemini keeps
# this in session context until reconnect. Re-sent on gallery_version bumps.
async def _send_gallery_primer(self, session: Any, version: int) -> None:
    """Send the face gallery to Gemini as one multimodal user turn.

    The turn carries a greeting instruction followed, per gallery entry,
    by a text label and that entry's JPEG samples. On success — or when
    the gallery is empty — `version` is recorded as the primed gallery
    version. If the gallery module is unavailable, loading fails, or the
    send fails, the method returns without recording anything.

    Args:
        session: the connected live session.
        version: gallery version to record once primed.
    """
    try:
        from Project.Sanad.vision.face_gallery import FaceGallery
    except Exception as exc:
        log.info("face gallery module unavailable: %s", exc)
        return

    gallery = FaceGallery(_FACES_DIR)
    try:
        entries = gallery.load_for_primer(
            max_samples_per_face=_FACES_MAX_SAMPLES,
            resize_long_side=_FACES_PRIMER_RESIZE,
        )
    except Exception as exc:
        log.warning("face gallery load failed: %s", exc)
        return

    if not entries:
        log.info("face gallery empty — primer skipped (v.%d)", version)
        self._gallery_version_primed = version
        return

    instruction = (
        "GALLERY PRIMER (do not reply to this turn). "
        "Below are people you know. When the live camera shows one of "
        "them, greet them warmly by name in UAE Khaleeji dialect "
        "(for example: 'هلا والله يا كسام، شحالك؟'), and you may use "
        "the notes about them to make the conversation personal. "
        "For faces NOT in this gallery, welcome them as a guest "
        "without inventing a name. Greet each person only once per "
        "minute to avoid repetition."
    )
    parts: list[dict[str, Any]] = [{"text": instruction}]
    for person, samples in entries:
        if person.name:
            label = f"This person is named {person.name}."
        else:
            label = "This person's name is unknown — greet as guest."
        if person.description:
            label += f" Notes about them: {person.description}"
        parts.append({"text": f"\n{label}"})
        parts.extend(
            {"inline_data": {"mime_type": "image/jpeg", "data": sample}}
            for sample in samples
        )

    try:
        await session.send_client_content(
            turns=[{"role": "user", "parts": parts}],
            turn_complete=True,
        )
    except Exception as exc:
        log.warning("primer send failed: %s", exc)
        return

    self._gallery_version_primed = version
    log.info("face gallery primed: %d person(s), v.%d", len(entries), version)