1291 lines
57 KiB
Python
1291 lines
57 KiB
Python
"""Gemini brain — live conversation loop using the google-genai SDK.
|
|
|
|
Implements the VoiceBrain contract documented in `voice/model_script.py`:
|
|
|
|
__init__(audio_io, recorder, voice_name, system_prompt)
|
|
async run()
|
|
stop()
|
|
|
|
Owns everything Gemini-specific: the `genai.Client`, `LiveConnectConfig`,
|
|
the session connect/receive loop, VAD-based barge-in, echo suppression,
|
|
reconnect backoff. Hardware I/O is delegated to `audio_io` and per-turn
|
|
WAV capture to `recorder` — both are model-agnostic.
|
|
|
|
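Env overrides (defaults in parentheses):
    SANAD_GEMINI_MODEL              — Gemini Live model id (without "models/" prefix)
    SANAD_RECOGNITION_STATE_PATH    — recognition state file (BASE_DIR/data/.recognition_state.json)
    SANAD_VISION_SEND_HZ            — vision send rate (2)
    SANAD_VISION_STALE_MS           — vision staleness limit in ms (1500)
    SANAD_RECOGNITION_POLL_S        — recognition poll period in seconds (1.0)
    SANAD_FACES_DIR                 — face gallery directory (BASE_DIR/data/faces)
    SANAD_FACES_MAX_SAMPLES         — max sample images per primer entry (3)
    SANAD_FACES_PRIMER_RESIZE       — primer image resize target (256)
    SANAD_ZONES_DIR                 — zones gallery directory (BASE_DIR/data/zones)
    SANAD_AUDIO_PROFILE             — fallback audio profile id ("builtin")
    SANAD_VISION_ENABLE, SANAD_FACE_RECOGNITION_ENABLE,
    SANAD_ZONE_RECOGNITION_ENABLE, SANAD_MOVEMENT_ENABLE
                                    — "1" forces the matching feature flag on at boot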
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import array
|
|
import asyncio
|
|
import base64
|
|
import json
|
|
import os
|
|
import sys
|
|
import threading
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import numpy as np
|
|
|
|
from google import genai
|
|
from google.genai import types
|
|
|
|
from Project.Sanad.config import (
|
|
BASE_DIR,
|
|
CHUNK_SIZE,
|
|
GEMINI_API_KEY,
|
|
GEMINI_VOICE,
|
|
RECEIVE_SAMPLE_RATE,
|
|
SEND_SAMPLE_RATE,
|
|
)
|
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
|
from Project.Sanad.core.logger import get_logger
|
|
from Project.Sanad.vision import recognition_state as _recog_state
|
|
|
|
# Module-wide logger for the Gemini brain.
log = get_logger("gemini_brain")

# Sections of the "voice" config: general brain settings, the server-side
# VAD parameters used in the LiveConnectConfig, and the local barge-in
# detector parameters used by the mic send loop.
_SV = _cfg_section("voice", "sanad_voice")
_VAD = _cfg_section("voice", "vad")
_BI = _cfg_section("voice", "barge_in")

# Gemini Live model id; SANAD_GEMINI_MODEL overrides the built-in default.
_MODEL = os.environ.get(
    "SANAD_GEMINI_MODEL",
    "gemini-2.5-flash-native-audio-preview-12-2025",
)
# Linear gain applied to mic samples before they are sent to Gemini.
_MIC_GAIN = _SV.get("mic_gain", 1.0)
# Hard cap (seconds) on a single live session before run() reconnects.
_SESSION_TIMEOUT = _SV.get("session_timeout_sec", 660)
# Upper bound (seconds) for the exponential reconnect backoff in run().
_MAX_RECONNECT_DELAY = _SV.get("max_reconnect_delay_sec", 30)
# Consecutive session errors after which run() rebuilds the genai client.
_MAX_CONSECUTIVE_ERRORS = _SV.get("max_consecutive_errors", 10)
# Seconds without a server message before the receive loop ends the session.
_NO_MESSAGES_TIMEOUT = _SV.get("no_messages_timeout_sec", 30)

# Bytes requested per mic read. The mic path decodes chunks as int16, so this
# is presumably CHUNK_SIZE samples x 2 bytes — confirm CHUNK_SIZE is in samples.
_CHUNK_BYTES = CHUNK_SIZE * 2
# One chunk of zeroed PCM, sent in place of quiet mic frames while the AI
# is speaking (echo suppression).
_SILENCE_PCM = b"\x00" * _CHUNK_BYTES

# ── Recognition (camera + face gallery) tunables ──
# Path of the recognition state file read at construction time.
_RECOG_STATE_PATH = Path(os.environ.get(
    "SANAD_RECOGNITION_STATE_PATH",
    str(BASE_DIR / "data" / ".recognition_state.json"),
))
# NOTE(review): the consumers of the next three values are outside this
# excerpt; the names suggest frame send rate, frame staleness cutoff and
# state-file poll period — confirm against the loops that use them.
# A non-numeric env value raises ValueError at import time.
_VISION_SEND_HZ = float(os.environ.get("SANAD_VISION_SEND_HZ", "2"))
_VISION_STALE_MS = int(os.environ.get("SANAD_VISION_STALE_MS", "1500"))
_RECOG_POLL_S = float(os.environ.get("SANAD_RECOGNITION_POLL_S", "1.0"))
# Face gallery directory.
_FACES_DIR = Path(os.environ.get(
    "SANAD_FACES_DIR",
    str(BASE_DIR / "data" / "faces"),
))
# Max sample images attached per primer entry, and the resize target passed
# to the gallery's primer-resize helper (also reused for place photos in
# _announce_nav_target).
_FACES_MAX_SAMPLES = int(os.environ.get("SANAD_FACES_MAX_SAMPLES", "3"))
_FACES_PRIMER_RESIZE = int(os.environ.get("SANAD_FACES_PRIMER_RESIZE", "256"))
# N3 — zones gallery (zone → place → linked faces). Folded into a Gemini
# primer turn so Gemini can recognise / talk about known locations and the
# people associated with them.
_ZONES_DIR = Path(os.environ.get(
    "SANAD_ZONES_DIR",
    str(BASE_DIR / "data" / "zones"),
))


# ── stdin push channel (Marcus pattern) ──────────────────────
# The GeminiSubprocess supervisor writes line-oriented messages to this
# process's stdin:
#   "frame:<base64-jpeg>\n"   — a camera frame to relay to Gemini Live
#   "state:<json>\n"          — a motion-state update to inject as text
#   "profile:<json>\n"        — an audio-profile hot-swap request
# A daemon thread parses them into the caches below; the asyncio tasks
# _send_frame_loop / _send_state_loop / _audio_swap_loop drain those caches.

# Most recent decoded camera frame and the wall-clock time it arrived.
# Guarded by _LATEST_FRAME_LOCK because the writer is the stdin thread.
_LATEST_FRAME_LOCK = threading.Lock()
_LATEST_FRAME: dict = {"bytes": None, "ts": 0.0}

# Formatted state messages waiting to be injected, in arrival order.
_STATE_LOCK = threading.Lock()
_STATE_PENDING: list[str] = []

# Maps the "event" field of a state message to the tag prefixed to the
# injected text; events not listed here are dropped by the stdin watcher.
_STATE_TAGS = {
    "start": "[STATE-START]",
    "complete": "[STATE-DONE]",
    "interrupted": "[STATE-INTERRUPTED]",
    "error": "[STATE-ERROR]",
    "paused": "[STATE-PAUSED]",
    "resumed": "[STATE-RESUMED]",
}

# Pending audio-profile swap signalled by the parent over "profile:" stdin
# lines. _audio_swap_loop drains it inside the brain's asyncio loop.
# Only the latest request is kept: a newer one overwrites an undrained one.
_PROFILE_LOCK = threading.Lock()
_PROFILE_PENDING: dict = {"id": None, "reason": ""}

# Profile ids accepted from the parent; anything else is ignored.
_VALID_PROFILES = {"builtin", "anker", "hollyland_builtin"}
|
|
|
|
|
|
def _stdin_text(value: Any) -> str:
    """Return `value` stripped when it is a string, otherwise ''."""
    return value.strip() if isinstance(value, str) else ""


def _stdin_load_object(raw: str) -> Optional[dict]:
    """Parse `raw` as JSON; return it only when it is a JSON object."""
    try:
        payload = json.loads(raw)
    except (ValueError, RecursionError):
        return None
    return payload if isinstance(payload, dict) else None


def _stdin_handle_frame(b64: str) -> None:
    """Decode one base64 JPEG and publish it as the latest camera frame."""
    try:
        data = base64.b64decode(b64)
    except (ValueError, TypeError):
        return
    if data:
        with _LATEST_FRAME_LOCK:
            _LATEST_FRAME["bytes"] = data
            _LATEST_FRAME["ts"] = time.time()


def _stdin_handle_state(raw: str) -> None:
    """Format one motion-state message and queue it for injection."""
    payload = _stdin_load_object(raw)
    if payload is None:
        return
    event = _stdin_text(payload.get("event")).lower()
    cmd = _stdin_text(payload.get("cmd"))
    tag = _STATE_TAGS.get(event)
    if not tag or not cmd:
        return
    msg = f"{tag} {cmd}"
    elapsed = payload.get("elapsed_sec")
    if isinstance(elapsed, (int, float)):
        msg += f" ({float(elapsed):.1f}s)"
    reason = payload.get("reason")
    if reason and event == "error":
        msg += f" — {reason}"
    with _STATE_LOCK:
        _STATE_PENDING.append(msg)


def _stdin_handle_profile(raw: str) -> None:
    """Record a requested audio-profile swap for _audio_swap_loop."""
    # Only the target is stashed here; the asyncio side performs the actual
    # swap so PyAudio open/close happens off the stdin thread.
    payload = _stdin_load_object(raw)
    if payload is None:
        return
    pid = _stdin_text(payload.get("id")).lower()
    if pid not in _VALID_PROFILES:
        return
    with _PROFILE_LOCK:
        _PROFILE_PENDING["id"] = pid
        _PROFILE_PENDING["reason"] = _stdin_text(payload.get("reason"))


def _stdin_watcher() -> None:
    """Daemon thread — parse 'frame:' / 'state:' / 'profile:' lines off stdin.

    Best-effort: any malformed line is skipped without stopping the thread.
    That includes JSON that is not an object and fields of the wrong type,
    which previously raised inside the loop and silently ended the watcher.
    The thread exits when the parent closes our stdin (subprocess teardown)
    or when reading stdin itself fails; the latter is logged.
    """
    handlers = (
        ("frame:", _stdin_handle_frame),
        ("state:", _stdin_handle_state),
        ("profile:", _stdin_handle_profile),
    )
    try:
        for line in sys.stdin:
            line = line.rstrip("\n")
            if not line:
                continue
            for prefix, handler in handlers:
                if not line.startswith(prefix):
                    continue
                try:
                    handler(line[len(prefix):])
                except Exception as exc:
                    # Last-resort guard: one bad line must never take the
                    # whole push channel down.
                    log.debug("stdin %s line skipped: %s", prefix, exc)
                break
    except Exception as exc:
        log.warning("stdin watcher stopped: %s", exc)
|
|
|
|
|
|
# Start the watcher at import time — it blocks harmlessly on sys.stdin
# until the supervisor sends something. Daemon so it never blocks exit.
# NOTE: this is an import-time side effect; merely importing this module
# starts consuming the process's stdin.
threading.Thread(target=_stdin_watcher, daemon=True, name="stdin-watcher").start()
|
|
|
|
|
|
def _audio_energy(pcm: bytes) -> int:
|
|
try:
|
|
samples = array.array("h", pcm)
|
|
return sum(abs(s) for s in samples) // len(samples) if samples else 0
|
|
except Exception:
|
|
return 0
|
|
|
|
|
|
class GeminiBrain:
|
|
"""Gemini Live conversation brain — reconnect-safe."""
|
|
|
|
def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
             system_prompt: str = ""):
    """Wire the brain to its audio I/O and recorder and load boot flags.

    Args:
        audio_io: object exposing `mic` and `speaker`; its optional
            `_audio_client` and `profile_id` attributes are picked up too.
        recorder: per-turn capture sink used by the session loops.
        voice_name: prebuilt voice name; falls back to GEMINI_VOICE.
        system_prompt: text used as the session's system instruction.
    """

    def _boot_flag(persisted: Any, env_name: str) -> bool:
        # A feature is on when the state file says so, or when its
        # SANAD_* variable is exactly "1".
        return bool(persisted or os.environ.get(env_name, "0") == "1")

    # ── audio devices ──
    self._audio = audio_io
    self._mic = audio_io.mic
    self._speaker = audio_io.speaker
    # Kept on the brain so swap_audio_devices() can rebuild profiles that
    # need DDS (`builtin`, `hollyland_builtin`) without re-init'ing.
    self._audio_client = getattr(audio_io, "_audio_client", None)
    # Current profile id (driven by the parent's "profile:" stdin push).
    # Prefer what audio_io reports; otherwise fall back to the
    # SANAD_AUDIO_PROFILE env value (default "builtin").
    profile_id = getattr(audio_io, "profile_id", None)
    if not profile_id:
        profile_id = os.environ.get("SANAD_AUDIO_PROFILE", "builtin").strip().lower()
    self._current_profile_id = profile_id
    # Serialises concurrent swap requests so two pending profile changes
    # don't interleave mid-tear-down. Built lazily in run().
    self._swap_lock: Optional[asyncio.Lock] = None

    # ── conversation settings ──
    self._recorder = recorder
    self._voice = voice_name if voice_name else GEMINI_VOICE
    self._system_prompt = system_prompt
    self._api_key = GEMINI_API_KEY
    # NOTE(review): created outside a running loop; on Python < 3.10
    # asyncio primitives bind to the loop current at construction time —
    # confirm the interpreter version this runs on.
    self._stop_flag = asyncio.Event()

    # ── per-session state (reset in the outer reconnect loop) ──
    self._speaking = self._stream_started = False
    self._barge_block_until = self._ai_speak_start = self._last_ai_audio = 0.0
    self._done: Optional[asyncio.Event] = None

    # ── recognition flags ──
    # Kept in sync with the state file by _recognition_state_watcher. Boot
    # defaults come from the file, OR-ed with the SANAD_* env switches.
    persisted = _recog_state.read(_RECOG_STATE_PATH)
    self._vision_enabled = _boot_flag(
        persisted.vision_enabled, "SANAD_VISION_ENABLE")
    self._face_rec_enabled = _boot_flag(
        persisted.face_rec_enabled, "SANAD_FACE_RECOGNITION_ENABLE")
    self._gallery_version_primed = -1  # bumped after first successful primer
    # N3 — zones knowledge toggle + primer version tracking.
    self._zone_rec_enabled = _boot_flag(
        persisted.zone_rec_enabled, "SANAD_ZONE_RECOGNITION_ENABLE")
    self._zones_version_primed = -1
    # "Go here" destination already announced this session (zone_id, place_id).
    zone_id = int(persisted.nav_target_zone_id)
    place_id = int(persisted.nav_target_place_id)
    self._nav_target = (zone_id, place_id)
    # N2 — Gemini-driven locomotion enable gate (announce only; the
    # actual dispatch loop lives in the parent and is wired separately).
    self._movement_enabled = _boot_flag(
        persisted.movement_enabled, "SANAD_MOVEMENT_ENABLE")
|
|
|
|
def stop(self) -> None:
    """Ask the run loop to exit at its next stop-flag check.

    Never raises: any error while setting the flag is swallowed.
    """
    try:
        self._stop_flag.set()
    except Exception:
        return
|
|
|
|
# ─── public entry point ───────────────────────────────
|
|
|
|
async def run(self) -> None:
    """Run the conversation until `stop()` is called or the task is cancelled.

    Each iteration opens one Gemini Live session and runs the per-session
    loops (mic send, receive, frame relay, state relay, recognition watcher,
    audio-profile swap) concurrently. A session ends when those loops
    finish, when `_SESSION_TIMEOUT` elapses, or when one of them raises;
    the outer loop then reconnects. Session errors back off exponentially
    (capped at `_MAX_RECONNECT_DELAY`), and after `_MAX_CONSECUTIVE_ERRORS`
    failures in a row the `genai.Client` is rebuilt.
    """
    client = genai.Client(api_key=self._api_key)
    config = self._build_config()
    session_num = 0
    start_time = time.time()
    consecutive_errors = 0

    while not self._stop_flag.is_set():
        session_num += 1
        self._reset_turn_state()
        uptime_min = (time.time() - start_time) / 60

        try:
            log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
                     session_num, uptime_min)
            async with client.aio.live.connect(model=_MODEL, config=config) as session:
                log.info("connected — speak anytime!")
                consecutive_errors = 0
                self._mic.flush()
                self._done = asyncio.Event()
                # Reset per-session primer state so re-priming on reconnect
                # actually happens. The state watcher will re-prime as soon
                # as it sees vision+face-rec (and place-rec) enabled.
                self._gallery_version_primed = -1
                self._zones_version_primed = -1
                # Re-announce the active destination on reconnect.
                self._nav_target = (-1, -1)
                # Lazy-build the swap lock on the active asyncio loop.
                if self._swap_lock is None:
                    self._swap_lock = asyncio.Lock()

                session_done = self._done
                workers = [
                    asyncio.ensure_future(coro)
                    for coro in (
                        self._send_mic_loop(session),
                        self._receive_loop(session),
                        self._send_frame_loop(session),
                        self._send_state_loop(session),
                        self._recognition_state_watcher(session),
                        self._audio_swap_loop(session),
                    )
                ]
                try:
                    await asyncio.wait_for(
                        asyncio.gather(*workers),
                        timeout=_SESSION_TIMEOUT,
                    )
                except asyncio.TimeoutError:
                    log.warning("session timed out after %ds", _SESSION_TIMEOUT)
                except asyncio.CancelledError:
                    # NOTE(review): cancellation is absorbed here and the
                    # loop reconnects unless the stop flag is set — confirm
                    # that is intended for external task cancellation.
                    log.warning("session cancelled")
                finally:
                    # gather() leaves its other children running when one
                    # of them raises. Without this, orphaned loops would
                    # keep using the closed session and could set the NEXT
                    # session's `_done` (they read it through `self`).
                    session_done.set()
                    for worker in workers:
                        worker.cancel()
                    await asyncio.gather(*workers, return_exceptions=True)

                log.info("session #%d ended — reconnecting in 1s", session_num)
                # Off-loop: speaker.stop() may block on a lock held by an
                # in-flight send_chunk on the executor (see the warning in
                # swap_audio_devices).
                await asyncio.to_thread(self._speaker.stop)
                self._mic.flush()
                await asyncio.sleep(1)

        except asyncio.CancelledError:
            log.info("cancelled — stopping")
            break
        except KeyboardInterrupt:
            log.info("keyboard interrupt — stopping")
            break
        except Exception as exc:
            consecutive_errors += 1
            delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
            log.error("session error (#%d): %s — reconnecting in %ds",
                      consecutive_errors, exc, delay)
            await asyncio.sleep(delay)
            if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
                log.warning("%d consecutive errors — recreating client",
                            consecutive_errors)
                try:
                    client = genai.Client(api_key=self._api_key)
                    consecutive_errors = 0
                except Exception as ce:
                    log.error("client recreation failed: %s", ce)
|
|
|
|
# ─── Gemini config ────────────────────────────────────
|
|
|
|
def _build_config(self) -> types.LiveConnectConfig:
    """Assemble the LiveConnectConfig used for every session.

    Audio-only responses in the configured prebuilt voice, automatic
    activity detection tuned from the `vad` config section, input and
    output transcription enabled, and the system prompt as the system
    instruction. An unknown sensitivity name in the config raises
    AttributeError from the enum lookup.
    """
    speech = types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                voice_name=self._voice,
            ),
        ),
    )

    # Sensitivities are stored in config by enum member name.
    start_level = getattr(
        types.StartSensitivity,
        _VAD.get("start_sensitivity", "START_SENSITIVITY_HIGH"),
    )
    end_level = getattr(
        types.EndSensitivity,
        _VAD.get("end_sensitivity", "END_SENSITIVITY_LOW"),
    )
    activity_detection = types.AutomaticActivityDetection(
        disabled=False,
        start_of_speech_sensitivity=start_level,
        end_of_speech_sensitivity=end_level,
        prefix_padding_ms=_VAD.get("prefix_padding_ms", 20),
        silence_duration_ms=_VAD.get("silence_duration_ms", 200),
    )
    realtime_input = types.RealtimeInputConfig(
        automatic_activity_detection=activity_detection,
    )

    input_transcription = types.AudioTranscriptionConfig()
    output_transcription = types.AudioTranscriptionConfig()
    instruction = types.Content(parts=[types.Part(text=self._system_prompt)])

    return types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        speech_config=speech,
        realtime_input_config=realtime_input,
        input_audio_transcription=input_transcription,
        output_audio_transcription=output_transcription,
        system_instruction=instruction,
    )
|
|
|
|
# ─── state helpers ────────────────────────────────────
|
|
|
|
def _reset_turn_state(self) -> None:
|
|
self._speaking = False
|
|
self._stream_started = False
|
|
self._barge_block_until = 0.0
|
|
self._ai_speak_start = 0.0
|
|
self._last_ai_audio = 0.0
|
|
|
|
def _interrupt(self, source: str = "local") -> None:
    """Cut the current AI utterance short and return to listening.

    Clears the speaking flags, stops the speaker, flushes the mic, closes
    the recorder's current turn, then logs which `source` triggered it.
    """
    self._speaking = self._stream_started = False
    # Order matters: silence playback first, then drop buffered mic audio,
    # then close out the recorded turn.
    self._speaker.stop()
    self._mic.flush()
    self._recorder.finish_turn()
    log.info("interrupt (%s)", source)
|
|
|
|
# ─── mic send loop ────────────────────────────────────
|
|
|
|
async def _send_mic_loop(self, session: Any) -> None:
    """Stream mic audio to Gemini with local barge-in and echo suppression.

    Runs until the session's `_done` event or the stop flag is set. Each
    chunk is read on the default executor, scaled by `_MIC_GAIN`, measured
    with `_audio_energy`, and sent as PCM at `SEND_SAMPLE_RATE`.

    A failed mic read or a failed send both end the session (they set
    `_done`) so the outer loop reconnects. Previously a read failure only
    left this loop, leaving the session up with no mic audio until the
    session timeout.
    """
    threshold = _BI.get("threshold", 500)
    chunks_needed = _BI.get("loud_chunks_needed", 3)
    cooldown = _BI.get("cooldown_sec", 0.3)
    echo_suppress_below = _BI.get("echo_suppress_below", 500)
    grace = _BI.get("ai_speak_grace_sec", 0.15)

    loop = asyncio.get_running_loop()
    loud_count = 0
    last_activity = time.time()

    while not self._done.is_set() and not self._stop_flag.is_set():
        try:
            raw = await loop.run_in_executor(
                None, self._mic.read_chunk, _CHUNK_BYTES,
            )
        except Exception as exc:
            log.warning("mic read failed: %s — ending session", exc)
            self._done.set()
            break

        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
        samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
        data = samples.tobytes()
        energy = _audio_energy(data)
        now = time.time()

        # Barge-in: after AI starts speaking, sustained user energy cuts it.
        # Loud chunks raise the counter and quiet ones decay it, so a single
        # spike cannot trigger an interrupt.
        if self._speaking and now >= self._barge_block_until:
            if (now - self._ai_speak_start) >= grace:
                if energy > threshold:
                    loud_count += 1
                else:
                    loud_count = max(0, loud_count - 1)
                if loud_count > chunks_needed:
                    log.info("BARGE-IN (e=%d)", energy)
                    self._interrupt("barge-in")
                    loud_count = 0
                    self._barge_block_until = now + cooldown

        # Echo suppression: while AI is speaking, mask quiet frames so the
        # mic doesn't feed the model its own voice bleed.
        send_data = data
        if self._speaking and energy < echo_suppress_below:
            send_data = _SILENCE_PCM

        # Record user audio when clearly speaking and AI isn't.
        if energy > 250 and not self._speaking:
            self._recorder.capture_user(data)

        # Keep-alive watchdog: log a heartbeat after 10s without speech.
        if energy > 250:
            last_activity = now
        elif now - last_activity > 10:
            log.info("alive (no speech %.0fs, e=%d)",
                     now - last_activity, energy)
            last_activity = now

        try:
            await session.send_realtime_input(
                audio=types.Blob(
                    data=send_data,
                    mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}",
                ),
            )
        except asyncio.CancelledError:
            return
        except Exception as exc:
            log.warning("mic send failed: %s — ending session", exc)
            self._done.set()
            return

        await asyncio.sleep(CHUNK_SIZE / SEND_SAMPLE_RATE)

    log.info("send_mic task ended")
|
|
|
|
# ─── receive loop ─────────────────────────────────────
|
|
|
|
async def _receive_loop(self, session: Any) -> None:
    """Consume server messages: transcripts, audio playback, turn boundaries.

    Runs until `_done` or the stop flag is set, the server sends `go_away`,
    or no message arrives for `_NO_MESSAGES_TIMEOUT` seconds between
    receive passes. Audio parts go to the speaker on the default executor
    and are mirrored to the recorder. `_done` is always set on exit so the
    sibling loops stop too.
    """
    loop = asyncio.get_running_loop()
    try:
        last_recv = time.time()
        while not self._done.is_set() and not self._stop_flag.is_set():
            async for response in session.receive():
                last_recv = time.time()
                if self._done.is_set():
                    break

                if getattr(response, "go_away", None) is not None:
                    log.info("server going away — will reconnect")
                    self._done.set()
                    return

                sc = response.server_content
                if sc is None:
                    continue

                if sc.interrupted is True:
                    if self._speaking:
                        log.info("Gemini interrupted")
                        self._interrupt("gemini")
                    continue

                if sc.input_transcription:
                    text = (sc.input_transcription.text or "").strip()
                    if text and not self._speaking:
                        log.info("USER: %s", text)
                        self._recorder.add_user_text(text)

                if sc.output_transcription:
                    text = (sc.output_transcription.text or "").strip()
                    if text:
                        # Emit as "BOT:" (no space before colon) so the
                        # supervisor's _track_line can parse it the same
                        # way it parses "USER:" — this is the channel the
                        # movement dispatcher (N2) reads Gemini's own
                        # spoken phrases from. Keep in lock-step with
                        # GeminiSubprocess._track_line.
                        log.info("BOT: %s", text)
                        self._recorder.add_robot_text(text)

                if sc.model_turn:
                    # `parts` is optional in the SDK's Content type; a turn
                    # without parts must not tear the session down.
                    for part in sc.model_turn.parts or ():
                        if part.inline_data and part.inline_data.data:
                            now = time.time()
                            if not self._speaking:
                                self._ai_speak_start = now
                            self._speaking = True
                            self._last_ai_audio = now
                            raw_audio = part.inline_data.data
                            self._recorder.capture_robot(raw_audio)
                            audio = np.frombuffer(raw_audio, dtype=np.int16)
                            if not self._stream_started:
                                await loop.run_in_executor(
                                    None, self._speaker.begin_stream,
                                )
                                self._stream_started = True
                            await loop.run_in_executor(
                                None, self._speaker.send_chunk,
                                audio, RECEIVE_SAMPLE_RATE,
                            )

                if sc.turn_complete:
                    if (self._speaking and self._stream_started
                            and not self._speaker.interrupted):
                        log.info("speaker %.1fs", self._speaker.total_sent_sec)
                        # Let queued playback drain before listening again.
                        await loop.run_in_executor(
                            None, self._speaker.wait_finish,
                        )
                    elif self._speaking and self._speaker.interrupted:
                        log.info("speaker interrupted")
                    self._speaking = False
                    self._stream_started = False
                    self._mic.flush()
                    self._recorder.finish_turn()
                    log.info("listening")

            # Checked only between receive() passes: catches a receive()
            # that keeps returning without delivering any message.
            if time.time() - last_recv > _NO_MESSAGES_TIMEOUT:
                log.warning("no messages from Gemini for %ds — session dead",
                            _NO_MESSAGES_TIMEOUT)
                break
            await asyncio.sleep(0.1)

    except Exception as exc:
        log.warning("receive ended: %s", exc)
    finally:
        self._done.set()
|
|
|
|
# ─── vision-state announcer ───────────────────────────
|
|
# Injects the camera state into the live session as text context.
|
|
# On a live toggle Gemini is told to say so out loud ("I can see you
|
|
# now" / "I can't see you anymore"); at session start it's silent
|
|
# standing context so "can you see me?" is answered honestly.
|
|
|
|
async def _announce_vision_state(self, session: Any, enabled: bool,
                                 is_toggle: bool) -> None:
    """Inject the camera state into the live session as text context.

    Args:
        session: live session used to send the text.
        enabled: whether the camera is currently on.
        is_toggle: True for a live toggle (Gemini is asked to say so out
            loud), False for silent standing context at session start.

    Send failures are logged, not raised; cancellation propagates.
    """
    # Keyed by (is_toggle, enabled).
    prompts = {
        (True, True): (
            "[VISION ON] Your camera was just enabled — you can now see "
            "the user through it. Briefly tell them you can see them now, "
            "in your normal Khaleeji style (for example: "
            "'هلا، الحين أشوفك زين')."
        ),
        (True, False): (
            "[VISION OFF] Your camera was just disabled — you can no "
            "longer see anything. Briefly tell the user you can't see "
            "them anymore. If they later ask whether you can see them, "
            "tell them to enable the camera from the dashboard."
        ),
        (False, True): (
            "[VISION STATUS] Your camera is ON — you can see the user "
            "through it. Do not announce this unprompted; just answer "
            "naturally if they ask what you see."
        ),
        (False, False): (
            "[VISION STATUS] Your camera is OFF — you cannot see anything "
            "right now. If the user asks whether you can see them, tell "
            "them to enable the camera from the dashboard. Do not announce "
            "this unprompted."
        ),
    }
    text = prompts[(bool(is_toggle), bool(enabled))]
    try:
        await session.send_realtime_input(text=text)
        log.info("vision-state injected (enabled=%s, toggle=%s)",
                 enabled, is_toggle)
    except asyncio.CancelledError:
        raise
    except Exception as exc:
        log.warning("vision-state inject failed: %s", exc)
|
|
|
|
# ─── face-recognition-state announcer ─────────────────
|
|
# Same idea as _announce_vision_state, for the face-recognition toggle.
|
|
# On a live OFF toggle it also tells Gemini to disregard the gallery —
|
|
# so OFF takes effect immediately instead of lingering until reconnect.
|
|
|
|
async def _announce_facerec_state(self, session: Any, enabled: bool,
                                  is_toggle: bool) -> None:
    """Inject the face-recognition state into the live session as text.

    Args:
        session: live session used to send the text.
        enabled: whether face recognition is currently on.
        is_toggle: True for a live toggle (spoken confirmation requested;
            an OFF toggle also tells Gemini to disregard the gallery),
            False for silent standing context at session start.

    Send failures are logged, not raised; cancellation propagates.
    """
    if is_toggle:
        if enabled:
            text = (
                "[FACE RECOGNITION ON] Face recognition was just enabled — "
                "you'll be shown the people you know in a moment. Briefly "
                "tell the user you can now recognise the people you know, in "
                "your normal Khaleeji style."
            )
        else:
            text = (
                "[FACE RECOGNITION OFF] Face recognition was just disabled. "
                "Disregard the face gallery you were given earlier — stop "
                "greeting people by name and do not identify anyone. Briefly "
                "tell the user you'll no longer recognise faces."
            )
    elif enabled:
        # Session start, face rec already on.
        text = (
            "[FACE RECOGNITION STATUS] Face recognition is ON — when you "
            "see someone you've been shown in the gallery, greet them by "
            "name. Do not announce this unprompted."
        )
    else:
        # Session start, face rec off.
        text = (
            "[FACE RECOGNITION STATUS] Face recognition is OFF — you "
            "cannot identify people. If the user asks who someone is or "
            "whether you recognise them, tell them to enable face "
            "recognition from the dashboard. Do not announce this "
            "unprompted."
        )
    try:
        await session.send_realtime_input(text=text)
        log.info("face-rec-state injected (enabled=%s, toggle=%s)",
                 enabled, is_toggle)
    except asyncio.CancelledError:
        raise
    except Exception as exc:
        log.warning("face-rec-state inject failed: %s", exc)
|
|
|
|
# ─── place-recognition-state announcer (N3) ───────────
|
|
# Same idea as _announce_facerec_state, for the places-gallery toggle.
|
|
# On a live OFF toggle it also tells Gemini to disregard the places it
|
|
# was given so OFF takes effect immediately instead of lingering.
|
|
|
|
async def _announce_zonerec_state(self, session: Any, enabled: bool,
                                  is_toggle: bool) -> None:
    """Inject the zone-recognition state into the live session as text.

    Args:
        session: live session used to send the text.
        enabled: whether zone recognition is currently on.
        is_toggle: True for a live toggle (spoken confirmation requested;
            an OFF toggle also tells Gemini to disregard known places),
            False for silent standing context at session start.

    Send failures are logged, not raised; cancellation propagates.
    """
    if is_toggle:
        if enabled:
            text = (
                "[ZONE RECOGNITION ON] You were just given the zones and places "
                "you know (and the people associated with them). Briefly tell "
                "the user you now know your way around, in your normal Khaleeji "
                "style."
            )
        else:
            text = (
                "[ZONE RECOGNITION OFF] Zone recognition was just disabled. "
                "Disregard the zones and places you were given earlier — stop "
                "naming rooms or locations. Briefly tell the user you'll no "
                "longer recognise places."
            )
    elif enabled:
        # Session start, zone rec already on.
        text = (
            "[ZONE RECOGNITION STATUS] Zone recognition is ON — when you see "
            "or are asked about a zone/place you've been told about, you may "
            "name it and use its description. Do not announce this "
            "unprompted."
        )
    else:
        # Session start, zone rec off.
        text = (
            "[ZONE RECOGNITION STATUS] Zone recognition is OFF — you do not "
            "know any specific zones or places. If the user asks where they "
            "are or to go somewhere by name, tell them to enable zone "
            "recognition from the dashboard. Do not announce this "
            "unprompted."
        )
    try:
        await session.send_realtime_input(text=text)
        log.info("zone-rec-state injected (enabled=%s, toggle=%s)",
                 enabled, is_toggle)
    except asyncio.CancelledError:
        raise
    except Exception as exc:
        log.warning("zone-rec-state inject failed: %s", exc)
|
|
|
|
# ─── navigation-target announcer (N3 "go here") ───────
|
|
# When the operator sets a destination, tell Gemini which place to go to
|
|
# and show it the reference photo(s). Actual robot motion is wired by N2;
|
|
# this establishes the goal + visual reference.
|
|
|
|
async def _announce_nav_target(self, session: Any,
                               zone_id: int, place_id: int) -> None:
    """Tell Gemini the operator's destination and show its reference photos.

    A falsy `zone_id` or `place_id` means "destination cleared" and sends a
    short text note instead. Otherwise the place is looked up in the zones
    gallery and sent as one user turn: an instruction followed by up to
    `_FACES_MAX_SAMPLES` JPEG samples. Lookup and send failures are logged,
    not raised; cancellation propagates.
    """
    if not (zone_id and place_id):
        try:
            await session.send_realtime_input(text=(
                "[DESTINATION CLEARED] You have no specific destination right "
                "now. Do not announce this unprompted."
            ))
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            log.warning("nav-clear inject failed: %s", exc)
        return

    try:
        from Project.Sanad.vision.zone_gallery import ZoneGallery
        gallery = ZoneGallery(_ZONES_DIR)
        place = gallery.get_place(zone_id, place_id)
        zone = gallery.get_zone(zone_id)
    except Exception as exc:
        log.warning("nav-target resolve failed: %s", exc)
        return
    if place is None:
        log.info("nav-target zone_%d/place_%d not found — skipping", zone_id, place_id)
        return

    # Fall back to generic labels when the gallery entry has no name.
    place_name = place.name or f"place {place_id}"
    if zone:
        zone_name = zone.name or f"zone {zone_id}"
    else:
        zone_name = f"zone {zone_id}"

    pieces = [
        f"[GO HERE] The user has set your destination to '{place_name}' in "
        f"'{zone_name}'."
    ]
    if place.description:
        pieces.append(f" Notes: {place.description}.")
    pieces.append(
        " The image(s) below show what it looks like so you can recognise it. "
        "If walking is enabled you will head there; if it is off, tell the "
        "user to enable movement from the dashboard. Briefly acknowledge the "
        "destination in your normal Khaleeji style."
    )
    parts: list[dict[str, Any]] = [{"text": "".join(pieces)}]

    for sample_path in place.sample_paths[:_FACES_MAX_SAMPLES]:
        try:
            raw = sample_path.read_bytes()
        except OSError:
            # Unreadable sample: send the remaining ones.
            continue
        # Use the original bytes when the resize helper returns nothing.
        jpeg = gallery._resize_for_primer(raw, _FACES_PRIMER_RESIZE) or raw
        parts.append({"inline_data": {"mime_type": "image/jpeg", "data": jpeg}})

    try:
        await session.send_client_content(
            turns=[{"role": "user", "parts": parts}], turn_complete=True,
        )
        log.info("nav-target injected → zone_%d/place_%d (%s)",
                 zone_id, place_id, place_name)
    except asyncio.CancelledError:
        raise
    except Exception as exc:
        log.warning("nav-target inject failed: %s", exc)
|
|
|
|
# ─── movement-state announcer (N2) ────────────────────
|
|
# Spoken confirmation when the operator enables / disables Gemini-driven
|
|
# locomotion from the dashboard. The actual movement dispatch loop lives
|
|
# in the parent; this only gives the user audible feedback on the toggle.
|
|
|
|
async def _announce_movement_state(self, session: Any, enabled: bool,
                                   is_toggle: bool) -> None:
    """Inject the locomotion enable state into the live session as text.

    Args:
        session: live session used to send the text.
        enabled: whether Gemini-driven walking is currently allowed.
        is_toggle: True for a live toggle (spoken confirmation requested),
            False for silent standing context at session start.

    Send failures are logged, not raised; cancellation propagates.
    """
    # Keyed by (is_toggle, enabled).
    prompts = {
        (True, True): (
            "[MOVEMENT ON] Walking is now enabled — you can move when the "
            "user asks. Briefly tell the user movement is enabled and they "
            "can ask you to walk, in your normal Khaleeji style."
        ),
        (True, False): (
            "[MOVEMENT OFF] Walking was just disabled — you must not move. "
            "Briefly tell the user movement is now off. If they ask you to "
            "walk, tell them to enable movement from the dashboard first."
        ),
        (False, True): (
            "[MOVEMENT STATUS] Walking is ON — you may move when asked. Do "
            "not announce this unprompted."
        ),
        (False, False): (
            "[MOVEMENT STATUS] Walking is OFF — you cannot move right now. "
            "If the user asks you to walk, tell them to enable movement "
            "from the dashboard. Do not announce this unprompted."
        ),
    }
    text = prompts[(bool(is_toggle), bool(enabled))]
    try:
        await session.send_realtime_input(text=text)
        log.info("movement-state injected (enabled=%s, toggle=%s)",
                 enabled, is_toggle)
    except asyncio.CancelledError:
        raise
    except Exception as exc:
        log.warning("movement-state inject failed: %s", exc)
|
|
|
|
# ─── audio profile hot-swap ───────────────────────────
|
|
# The parent (GeminiSubprocess) polls pactl for the Anker USB device
|
|
# and writes "profile:<json>" lines to our stdin. _stdin_watcher parses
|
|
# them into _PROFILE_PENDING; this loop drains the flag on the asyncio
|
|
# loop and performs the actual swap. The brain's read/write sites
|
|
# (_send_mic_loop / _receive_loop) keep using self._mic / self._speaker —
|
|
# an atomic ref reassignment is enough because nothing caches them in
|
|
# a loop-local variable (verified in exploration).
|
|
|
|
async def _audio_swap_loop(self, session: Any) -> None:
    """Poll for a pending audio-profile request and perform the swap.

    Every 0.25s the pending request left by the stdin watcher is taken and
    cleared under `_PROFILE_LOCK`. A request for a profile other than the
    current one is handed to `swap_audio_devices`. Swap failures are
    logged; cancellation during a swap ends the loop.
    """
    while True:
        if self._done.is_set() or self._stop_flag.is_set():
            break
        await asyncio.sleep(0.25)

        # Take-and-clear atomically so a request is handled at most once.
        with _PROFILE_LOCK:
            target = _PROFILE_PENDING.get("id")
            reason = _PROFILE_PENDING.get("reason", "")
            _PROFILE_PENDING.update(id=None, reason="")

        if target and target != self._current_profile_id:
            try:
                await self.swap_audio_devices(session, target, reason=reason)
            except asyncio.CancelledError:
                return
            except Exception as exc:
                log.warning("audio swap failed: %s", exc)
|
|
|
|
async def swap_audio_devices(self, session: Any, profile_id: str,
                             reason: str = "") -> None:
    """Hot-swap mic + speaker to `profile_id` without dropping the live
    Gemini session. Idempotent (no-op if already on `profile_id`).

    Order matters: the new mic is built and started BEFORE the old one is
    torn down, so a transient PyAudio failure (e.g. udev hasn't exposed
    Anker yet) leaves the old backend in place. Up to 3 attempts are made
    with a fixed 0.4 s pause between them; when every attempt fails a
    WARN is logged and the current profile is kept. Note that
    `_audio_swap_loop` clears the pending request before calling this
    method, so another try only happens once a new request is queued.

    Args:
        session: Live session; only `send_realtime_input(text=...)` is
            called on it, to pass the silent "[AUDIO SWITCH]" context.
        profile_id: Identifier handed to `AudioIO.build_backends`.
        reason: Free-form text used only in the log line.

    Raises:
        asyncio.CancelledError: propagated when the task is cancelled
            while awaiting; the announce step re-raises it explicitly.
    """
    if self._swap_lock is None:
        log.warning("swap requested before session loop started — skipping")
        return

    max_attempts = 3
    retry_delay_s = 0.4

    async with self._swap_lock:
        if profile_id == self._current_profile_id:
            return
        prev = self._current_profile_id
        log.info("audio swap: %s → %s (reason=%s)", prev, profile_id, reason or "—")

        # Build + start the new mic. Retry: pactl can see the device
        # before PyAudio's get_device_count refreshes.
        try:
            from Project.Sanad.voice.audio_io import AudioIO as _AudioIO
        except Exception as exc:
            log.error("audio swap: AudioIO import failed: %s", exc)
            return

        new_mic = new_spk = None
        last_exc: Optional[BaseException] = None
        for attempt in range(1, max_attempts + 1):
            try:
                new_mic, new_spk = _AudioIO.build_backends(
                    profile_id, audio_client=self._audio_client,
                )
                # mic.start() opens PyAudio + spawns reader thread.
                # speaker is lazy (opens on first send_chunk).
                await asyncio.to_thread(new_mic.start)
                break
            except Exception as exc:
                last_exc = exc
                # Tear down a partially-built backend so the next attempt
                # gets a clean slate; don't leak PyAudio handles.
                if new_mic is not None:
                    try:
                        await asyncio.to_thread(new_mic.stop)
                    except Exception as stop_exc:
                        log.debug("audio swap: stopping failed mic raised: %s",
                                  stop_exc)
                new_mic = new_spk = None
                if attempt < max_attempts:
                    log.info("audio swap attempt %d failed: %s — retry in %.1fs",
                             attempt, exc, retry_delay_s)
                    await asyncio.sleep(retry_delay_s)
                else:
                    # Final attempt: nothing follows, so do not announce a
                    # retry or hold the swap lock for a pointless sleep.
                    log.info("audio swap attempt %d failed: %s", attempt, exc)

        if new_mic is None or new_spk is None:
            log.warning("audio swap %s → %s: all %d attempts failed (%s); "
                        "keeping current profile",
                        prev, profile_id, max_attempts, last_exc)
            return

        # Drain the old playback so any in-flight AI utterance stops
        # (interrupts mid-word — acceptable per spec, <1s gap).
        # MUST be awaited via to_thread: _PyAudioSpeaker.stop now
        # takes a per-instance RLock and an in-flight send_chunk on
        # the executor may be holding it across a back-pressured
        # PortAudio write. Calling stop() synchronously on the
        # event-loop thread would wedge the entire loop (mic,
        # vision, session.receive) until the pulse buffer drains.
        try:
            await asyncio.to_thread(self._speaker.stop)
        except Exception as exc:
            log.debug("audio swap: draining old speaker raised: %s", exc)
        try:
            self._mic.flush()
        except Exception as exc:
            log.debug("audio swap: flushing old mic raised: %s", exc)

        # Atomic ref swap — next read_chunk / send_chunk hits new.
        old_mic, old_spk = self._mic, self._speaker
        self._mic = new_mic
        self._speaker = new_spk
        self._current_profile_id = profile_id
        self._reset_turn_state()

        # Tear down old AFTER the ref swap so any executor call still
        # in flight finishes against the old handle and the next loop
        # iteration picks up the new one cleanly. Best-effort: a failing
        # stop() must not undo a swap that has already taken effect.
        try:
            await asyncio.to_thread(old_mic.stop)
        except Exception as exc:
            log.debug("audio swap: stopping old mic raised: %s", exc)
        try:
            await asyncio.to_thread(old_spk.stop)
        except Exception as exc:
            log.debug("audio swap: stopping old speaker raised: %s", exc)

        # Silent context to Gemini — so it knows the input chain changed
        # if asked (matches the _announce_vision_state pattern).
        try:
            await session.send_realtime_input(text=(
                f"[AUDIO SWITCH] Mic + speaker are now on the {profile_id!s} "
                f"audio profile. Do not announce this unprompted; just keep "
                f"replying normally — the user's voice may sound clearer or "
                f"different on the new device."
            ))
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            log.warning("audio-swap announce failed: %s", exc)
        log.info("audio swap complete: %s → %s", prev, profile_id)
|
|
|
|
# ─── recognition state watcher ────────────────────────
|
|
# Polls data/.recognition_state.json every SANAD_RECOGNITION_POLL_S seconds and
|
|
# mirrors vision_enabled / face_rec_enabled into in-memory flags so the
|
|
# rest of the session can react WITHOUT a Gemini reconnect.
|
|
|
|
async def _recognition_state_watcher(self, session: Any) -> None:
    """Mirror the recognition state file into the live session.

    Start-up phase: optionally sends the face-gallery and zone primers,
    announces the current vision / face-rec / zone-rec / movement flags
    as standing context (`is_toggle=False`), and announces the active
    navigation target when it differs from `self._nav_target`.

    Poll phase: sleeps `_RECOG_POLL_S` seconds per iteration, and when the
    state file's mtime has changed re-reads it, diffs it against the
    previously seen state, updates the in-memory flags and announces each
    toggle (`is_toggle=True`), re-sending primers when required. Runs
    until `_done` or `_stop_flag` is set.

    Args:
        session: Live session object forwarded to the announce/primer
            helpers.
    """
    last_mtime = 0.0
    # mtime of the most recent unreadable snapshot; used only to avoid
    # logging the same read failure on every poll.
    failed_mtime: Optional[float] = None
    last_state = _recog_state.RecognitionState(
        vision_enabled=self._vision_enabled,
        face_rec_enabled=self._face_rec_enabled,
        gallery_version=self._gallery_version_primed,
        zone_rec_enabled=self._zone_rec_enabled,
        zones_version=self._zones_version_primed,
        movement_enabled=self._movement_enabled,
    )
    # Best-effort initial primer if face_rec is already on at session start.
    if self._face_rec_enabled and self._vision_enabled:
        try:
            cur = _recog_state.read(_RECOG_STATE_PATH)
            await self._send_gallery_primer(session, cur.gallery_version)
        except Exception as exc:
            log.warning("initial gallery primer failed: %s", exc)

    # N3 — initial zones primer if zone recognition is already on. Unlike
    # faces this does NOT require vision: name+description-only places still
    # give Gemini useful knowledge to talk about.
    if self._zone_rec_enabled:
        try:
            cur = _recog_state.read(_RECOG_STATE_PATH)
            await self._send_zone_primer(session, cur.zones_version)
        except Exception as exc:
            log.warning("initial zone primer failed: %s", exc)

    # Tell Gemini the current camera + recognition + movement state at
    # session start — silent standing context so "can you see me?" / "do
    # you know who I am?" are answered honestly even if nothing is toggled.
    await self._announce_vision_state(
        session, self._vision_enabled, is_toggle=False,
    )
    await self._announce_facerec_state(
        session, self._face_rec_enabled, is_toggle=False,
    )
    await self._announce_zonerec_state(
        session, self._zone_rec_enabled, is_toggle=False,
    )
    await self._announce_movement_state(
        session, self._movement_enabled, is_toggle=False,
    )
    # N3 — announce the active "go here" destination (if any). _nav_target
    # was reset to (-1,-1) per session so this fires on every reconnect.
    try:
        cur = _recog_state.read(_RECOG_STATE_PATH)
        nav = (cur.nav_target_zone_id, cur.nav_target_place_id)
        if nav != self._nav_target:
            await self._announce_nav_target(session, nav[0], nav[1])
            self._nav_target = nav
    except Exception as exc:
        log.warning("initial nav-target announce failed: %s", exc)

    while not self._done.is_set() and not self._stop_flag.is_set():
        await asyncio.sleep(_RECOG_POLL_S)
        try:
            st = _RECOG_STATE_PATH.stat()
        except Exception:
            # Missing file or any other stat failure: try again next poll.
            continue
        if st.st_mtime == last_mtime:
            continue

        # Guard the read like the start-up reads above: a transient
        # failure (e.g. the file vanishing between stat and read) must
        # not kill the watcher task. last_mtime only advances after a
        # successful read so the change is retried, not lost.
        try:
            new_state = _recog_state.read(_RECOG_STATE_PATH)
        except Exception as exc:
            if st.st_mtime != failed_mtime:
                failed_mtime = st.st_mtime
                log.warning("recognition state read failed: %s", exc)
            continue
        last_mtime = st.st_mtime

        # Vision toggle — instant. Announce it out loud so Gemini reacts
        # ("I can see you now" / "I can't see you anymore").
        if new_state.vision_enabled != last_state.vision_enabled:
            self._vision_enabled = new_state.vision_enabled
            log.info("vision toggled → %s", self._vision_enabled)
            await self._announce_vision_state(
                session, self._vision_enabled, is_toggle=True,
            )

        # Face-rec toggle — announce it out loud. The OFF announcement
        # also tells Gemini to disregard the gallery, so OFF takes effect
        # immediately instead of lingering until the next reconnect.
        if new_state.face_rec_enabled != last_state.face_rec_enabled:
            self._face_rec_enabled = new_state.face_rec_enabled
            if self._face_rec_enabled:
                log.info("face rec enabled — announcing + sending primer")
            else:
                log.info("face rec disabled — telling Gemini to "
                         "disregard the gallery")
            await self._announce_facerec_state(
                session, self._face_rec_enabled, is_toggle=True,
            )

        # Conditions for re-priming:
        #   - face_rec just turned ON
        #   - gallery version bumped since the last primer
        face_rec_just_on = (
            new_state.face_rec_enabled and not last_state.face_rec_enabled
        )
        gallery_changed = (
            new_state.gallery_version != self._gallery_version_primed
        )
        if (self._face_rec_enabled
                and (face_rec_just_on or gallery_changed)
                and self._vision_enabled):
            try:
                await self._send_gallery_primer(
                    session, new_state.gallery_version,
                )
            except Exception as exc:
                log.warning("gallery primer failed: %s", exc)

        # N3 — zone-recognition toggle (announce out loud, like face-rec).
        if new_state.zone_rec_enabled != last_state.zone_rec_enabled:
            self._zone_rec_enabled = new_state.zone_rec_enabled
            log.info("zone rec toggled → %s", self._zone_rec_enabled)
            await self._announce_zonerec_state(
                session, self._zone_rec_enabled, is_toggle=True,
            )

        # Re-prime zones when zone-rec just turned ON or the zones version
        # bumped (any zone/place/face-link/photo CRUD). No vision needed.
        zone_rec_just_on = (
            new_state.zone_rec_enabled and not last_state.zone_rec_enabled
        )
        zones_changed = (
            new_state.zones_version != self._zones_version_primed
        )
        if self._zone_rec_enabled and (zone_rec_just_on or zones_changed):
            try:
                await self._send_zone_primer(
                    session, new_state.zones_version,
                )
            except Exception as exc:
                log.warning("zone primer failed: %s", exc)

        # N3 — "go here" destination changed (set or cleared). Announce +
        # show the reference photo. Diffed against the announced tuple so a
        # CRUD-only version bump above doesn't double-fire this.
        nav = (new_state.nav_target_zone_id, new_state.nav_target_place_id)
        if nav != self._nav_target:
            self._nav_target = nav
            await self._announce_nav_target(session, nav[0], nav[1])

        # N2 — movement enable/disable toggle (spoken confirmation only).
        if new_state.movement_enabled != last_state.movement_enabled:
            self._movement_enabled = new_state.movement_enabled
            log.info("movement toggled → %s", self._movement_enabled)
            await self._announce_movement_state(
                session, self._movement_enabled, is_toggle=True,
            )

        last_state = new_state
|
|
|
|
# ─── camera frame send loop ───────────────────────────
|
|
# Reads the latest JPEG from the _LATEST_FRAME cache (fed by the
|
|
# _stdin_watcher thread, which the GeminiSubprocess supervisor pushes
|
|
# 'frame:<b64>' lines into) and relays it to Gemini Live at
|
|
# _VISION_SEND_HZ. Only active when self._vision_enabled. Skips frames
|
|
# older than _VISION_STALE_MS so a stopped/unplugged camera doesn't
|
|
# waste tokens on a frozen scene.
|
|
|
|
async def _send_frame_loop(self, session: Any) -> None:
    """Relay the most recent cached camera JPEG to the live session.

    Each iteration sleeps for the send period (derived from
    `_VISION_SEND_HZ`, floored at 0.5 Hz) or the current error backoff,
    whichever is longer. A frame is sent only when vision is enabled, the
    cache holds data, the frame is no older than `_VISION_STALE_MS`, and
    its timestamp differs from the last frame sent. Send failures grow
    the backoff (capped at 5 s); a successful send resets it. Runs until
    `_done` or `_stop_flag` is set, or the send is cancelled.

    Args:
        session: Live session; `send_realtime_input(video=...)` is called
            on it.
    """
    interval_s = 1.0 / max(0.5, _VISION_SEND_HZ)
    max_age_s = _VISION_STALE_MS / 1000.0
    retry_wait_s = 0.0
    previous_ts = 0.0

    while not (self._done.is_set() or self._stop_flag.is_set()):
        await asyncio.sleep(max(interval_s, retry_wait_s))
        if not self._vision_enabled:
            continue

        # Snapshot payload and timestamp together so they belong to the
        # same frame.
        with _LATEST_FRAME_LOCK:
            jpeg = _LATEST_FRAME.get("bytes")
            frame_ts = _LATEST_FRAME.get("ts", 0.0)

        # Skip when: nothing cached yet / frame too old (supervisor
        # stopped pushing — camera off or unplugged) / frame already sent.
        if (not jpeg
                or (time.time() - frame_ts) > max_age_s
                or frame_ts == previous_ts):
            continue

        try:
            await session.send_realtime_input(
                video=types.Blob(data=jpeg, mime_type="image/jpeg"),
            )
        except asyncio.CancelledError:
            return
        except Exception as err:
            log.warning("frame send failed: %s", err)
            retry_wait_s = min(retry_wait_s * 2 + 0.5, 5.0)
        else:
            previous_ts = frame_ts
            retry_wait_s = 0.0
|
|
|
|
# ─── motion-state inject loop ─────────────────────────
|
|
# Drains _STATE_PENDING (fed by the _stdin_watcher from 'state:' lines
|
|
# the supervisor pushes when the arm starts/finishes/errors a motion)
|
|
# and injects each as silent text context into the live session, so
|
|
# Gemini can answer "what are you doing?" honestly. Per persona, Gemini
|
|
# reads these for context but does not narrate them unprompted.
|
|
|
|
async def _send_state_loop(self, session: Any) -> None:
    """Forward queued motion-state messages to the live session as text.

    Every 0.1 s the contents of `_STATE_PENDING` are copied out and the
    queue is cleared under `_STATE_LOCK`; each message is then sent with
    `send_realtime_input(text=...)`. A failed send is logged and the
    message dropped. Runs until `_done` or `_stop_flag` is set, or a send
    is cancelled.

    Args:
        session: Live session receiving the text messages.
    """
    while not (self._done.is_set() or self._stop_flag.is_set()):
        await asyncio.sleep(0.1)

        # Take the whole backlog in one critical section so the lock is
        # not held across the awaits below.
        with _STATE_LOCK:
            batch = list(_STATE_PENDING)
            _STATE_PENDING.clear()

        for text in batch:
            try:
                await session.send_realtime_input(text=text)
                log.info("STATE injected: %s", text)
            except asyncio.CancelledError:
                return
            except Exception as err:
                # Some SDK versions may not accept text on
                # send_realtime_input — warn and keep going; motion still
                # works, only this context channel is lost.
                log.warning("state inject failed: %s", err)
|
|
|
|
# ─── face gallery primer ──────────────────────────────
|
|
# Builds one multimodal turn carrying the entire face gallery + a Khaleeji
|
|
# greeting instruction, and sends it via send_client_content. Gemini keeps
|
|
# this in session context until reconnect. Re-sent on gallery_version bumps.
|
|
|
|
async def _send_gallery_primer(self, session: Any, version: int) -> None:
    """Send the face gallery to the session as one multimodal user turn.

    Loads the gallery from `_FACES_DIR`, builds a text instruction
    followed by a label and JPEG samples per person, and sends it with
    `send_client_content`. `self._gallery_version_primed` is set to
    `version` after a successful send, and also when the gallery is
    empty. An import, load or send failure is logged and leaves the
    primed version unchanged.

    Args:
        session: Live session receiving the primer turn.
        version: Gallery version to record as primed.
    """
    try:
        from Project.Sanad.vision.face_gallery import FaceGallery
    except Exception as err:
        log.info("face gallery module unavailable: %s", err)
        return

    store = FaceGallery(_FACES_DIR)
    try:
        people = store.load_for_primer(
            max_samples_per_face=_FACES_MAX_SAMPLES,
            resize_long_side=_FACES_PRIMER_RESIZE,
        )
    except Exception as err:
        log.warning("face gallery load failed: %s", err)
        return

    if not people:
        log.info("face gallery empty — primer skipped (v.%d)", version)
        self._gallery_version_primed = version
        return

    instructions = (
        "GALLERY PRIMER (do not reply to this turn). "
        "Below are people you know. When the live camera shows one of "
        "them, greet them warmly by name in UAE Khaleeji dialect "
        "(for example: 'هلا والله يا كسام، شحالك؟'), and you may use "
        "the notes about them to make the conversation personal. "
        "For faces NOT in this gallery, welcome them as a guest "
        "without inventing a name. Greet each person only once per "
        "minute to avoid repetition."
    )
    parts: list[dict[str, Any]] = [{"text": instructions}]

    # One text label per person, immediately followed by their samples.
    for person, samples in people:
        if person.name:
            caption = f"This person is named {person.name}."
        else:
            caption = "This person's name is unknown — greet as guest."
        if person.description:
            caption += f" Notes about them: {person.description}"
        parts.append({"text": f"\n— {caption}"})
        parts.extend(
            {"inline_data": {"mime_type": "image/jpeg", "data": sample}}
            for sample in samples
        )

    user_turn = {"role": "user", "parts": parts}
    try:
        await session.send_client_content(turns=[user_turn], turn_complete=True)
    except Exception as err:
        log.warning("primer send failed: %s", err)
        return

    # Record the version only once the primer has actually been sent.
    self._gallery_version_primed = version
    log.info("face gallery primed: %d person(s), v.%d", len(people), version)
|
|
|
|
# ─── zones primer (N3) ────────────────────────────────
|
|
# One multimodal turn carrying every zone, its places (name + description +
|
|
# reference photos), and the people linked to each place. A place may have
|
|
# NO photos (name + description only), so empty image lists are tolerated.
|
|
|
|
async def _send_zone_primer(self, session: Any, version: int) -> None:
    """Send every known zone and place to the session as one user turn.

    Loads zones from `_ZONES_DIR`, resolves linked face ids to names via
    the face gallery (best-effort), and builds a text header per zone and
    per place, each place followed by its reference JPEGs (possibly
    none). `self._zones_version_primed` is set to `version` after a
    successful send, and also when there are no zones. An import, load or
    send failure is logged and leaves the primed version unchanged.

    Args:
        session: Live session receiving the primer turn.
        version: Zones version to record as primed.
    """
    try:
        from Project.Sanad.vision.zone_gallery import ZoneGallery
    except Exception as err:
        log.info("zone gallery module unavailable: %s", err)
        return

    store = ZoneGallery(_ZONES_DIR)
    try:
        zone_entries = store.load_for_primer(
            max_samples_per_place=_FACES_MAX_SAMPLES,
            resize_long_side=_FACES_PRIMER_RESIZE,
        )
    except Exception as err:
        log.warning("zone gallery load failed: %s", err)
        return

    if not zone_entries:
        log.info("zone gallery empty — primer skipped (v.%d)", version)
        self._zones_version_primed = version
        return

    # Map face id → name once. Failures are ignored: places are then
    # listed without their "People often here" suffix.
    name_by_face_id: dict[int, str] = {}
    try:
        from Project.Sanad.vision.face_gallery import FaceGallery
        for face in FaceGallery(_FACES_DIR).list():
            if face.name:
                name_by_face_id[face.id] = face.name
    except Exception:
        pass

    intro = (
        "ZONES PRIMER (do not reply to this turn). Below are the zones "
        "and places you know, with the people often found at each place. "
        "Use them to answer where things are, to name a place when the "
        "live camera shows one, and to make directions personal. Do not "
        "invent zones or places that are not listed here."
    )
    parts: list[dict[str, Any]] = [{"text": intro}]

    zone_count = 0
    place_count = 0
    for zone, places in zone_entries:
        zone_count += 1
        zone_line = f"\n# Zone: {zone.name or '(unnamed)'}"
        if zone.description:
            zone_line += f" — {zone.description}"
        parts.append({"text": zone_line})
        if not places:
            parts.append({"text": " (no places yet)"})

        for place, photos in places:
            place_count += 1
            place_line = f"\n - Place: {place.name or '(unnamed)'}"
            if place.description:
                place_line += f" — {place.description}"
            # Linked faces without a known name are left out.
            regulars = [
                name_by_face_id[fid]
                for fid in place.face_ids
                if fid in name_by_face_id
            ]
            if regulars:
                place_line += f" | People often here: {', '.join(regulars)}"
            parts.append({"text": place_line})
            parts.extend(
                {"inline_data": {"mime_type": "image/jpeg", "data": photo}}
                for photo in photos
            )

    user_turn = {"role": "user", "parts": parts}
    try:
        await session.send_client_content(turns=[user_turn], turn_complete=True)
    except Exception as err:
        log.warning("zone primer send failed: %s", err)
        return

    # Record the version only once the primer has actually been sent.
    self._zones_version_primed = version
    log.info("zones primed: %d zone(s), %d place(s), v.%d",
             zone_count, place_count, version)
|