# Sanad_Package_1/vendor/Sanad/gemini/script.py

"""Gemini brain — live conversation loop using the google-genai SDK.

Implements the VoiceBrain contract documented in `voice/model_script.py`:

    __init__(audio_io, recorder, voice_name, system_prompt)
    async run()
    stop()

Owns everything Gemini-specific: the `genai.Client`, `LiveConnectConfig`,
the session connect/receive loop, VAD-based barge-in, echo suppression,
reconnect backoff. Hardware I/O is delegated to `audio_io` and per-turn
WAV capture to `recorder` — both are model-agnostic.

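Env overrides:
    SANAD_GEMINI_MODEL   — Gemini Live model id (without "models/" prefix)
    SANAD_AUDIO_PROFILE  — audio profile id used when `audio_io` does not
                           expose a `profile_id` (default "builtin")
    SANAD_RECOGNITION_STATE_PATH, SANAD_FACES_DIR, SANAD_ZONES_DIR
                         — recognition state file and gallery directories
                           (defaults live under BASE_DIR/data)
    SANAD_VISION_SEND_HZ, SANAD_VISION_STALE_MS, SANAD_RECOGNITION_POLL_S,
    SANAD_FACES_MAX_SAMPLES, SANAD_FACES_PRIMER_RESIZE
                         — vision / recognition tunables
    SANAD_VISION_ENABLE, SANAD_FACE_RECOGNITION_ENABLE,
    SANAD_ZONE_RECOGNITION_ENABLE, SANAD_MOVEMENT_ENABLE
                         — set to "1" to switch the matching feature on at
                           boot even if the state file has it off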
"""

from __future__ import annotations

import array
import asyncio
import base64
import json
import os
import sys
import threading
import time
from pathlib import Path
from typing import Any, Optional

import numpy as np

from google import genai
from google.genai import types

from Project.Sanad.config import (
    BASE_DIR,
    CHUNK_SIZE,
    GEMINI_API_KEY,
    GEMINI_VOICE,
    RECEIVE_SAMPLE_RATE,
    SEND_SAMPLE_RATE,
)
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
from Project.Sanad.vision import recognition_state as _recog_state

# Module logger. NOTE(review): the "BOT:" comment in _receive_loop says the
# supervisor parses some of these log lines, so treat message formats as part
# of this process's external contract.
log = get_logger("gemini_brain")

# Config sections, looked up once at import time.
_SV = _cfg_section("voice", "sanad_voice")   # gain + session/reconnect limits
_VAD = _cfg_section("voice", "vad")          # Gemini automatic activity detection
_BI = _cfg_section("voice", "barge_in")      # local barge-in / echo suppression

# Gemini Live model id; the SANAD_GEMINI_MODEL env var overrides the default.
_MODEL = os.environ.get(
    "SANAD_GEMINI_MODEL",
    "gemini-2.5-flash-native-audio-preview-12-2025",
)
# Linear gain multiplied into every mic sample before it is sent.
_MIC_GAIN = _SV.get("mic_gain", 1.0)
# Seconds one Live session may run before run() drops it and reconnects.
_SESSION_TIMEOUT = _SV.get("session_timeout_sec", 660)
# Ceiling (seconds) for the exponential reconnect backoff in run().
_MAX_RECONNECT_DELAY = _SV.get("max_reconnect_delay_sec", 30)
# After this many failed sessions in a row run() rebuilds the genai client.
_MAX_CONSECUTIVE_ERRORS = _SV.get("max_consecutive_errors", 10)
# Seconds without any server message after which _receive_loop gives up.
_NO_MESSAGES_TIMEOUT = _SV.get("no_messages_timeout_sec", 30)

# Size of one mic read in bytes. Mic audio is handled as int16 (2 bytes per
# sample), so CHUNK_SIZE is presumably a sample count — confirm in config.
_CHUNK_BYTES = CHUNK_SIZE * 2
# One chunk of digital silence, sent in place of quiet mic frames while the
# AI is speaking (echo suppression in _send_mic_loop).
_SILENCE_PCM = b"\x00" * _CHUNK_BYTES

# ── Recognition (camera + face gallery) tunables ──
# Every value below can be overridden through the SANAD_* env var it names.
# The numeric ones are parsed at import time, so a malformed value raises
# ValueError while this module is being imported.

# State file handed to recognition_state.read() to seed the feature flags.
_RECOG_STATE_PATH = Path(os.environ.get(
    "SANAD_RECOGNITION_STATE_PATH",
    str(BASE_DIR / "data" / ".recognition_state.json"),
))
# NOTE(review): the next three are not used in this part of the file. Going
# by their names they are the camera-frame send rate (Hz), the maximum age of
# a frame still worth sending (ms) and the state-file poll interval (s) —
# confirm against _send_frame_loop / _recognition_state_watcher.
_VISION_SEND_HZ = float(os.environ.get("SANAD_VISION_SEND_HZ", "2"))
_VISION_STALE_MS = int(os.environ.get("SANAD_VISION_STALE_MS", "1500"))
_RECOG_POLL_S = float(os.environ.get("SANAD_RECOGNITION_POLL_S", "1.0"))
# Presumably the root directory of the face gallery (not used in this part
# of the file).
_FACES_DIR = Path(os.environ.get(
    "SANAD_FACES_DIR",
    str(BASE_DIR / "data" / "faces"),
))
# Cap on reference images attached to one primer; _announce_nav_target also
# uses it to limit the place photos it sends.
_FACES_MAX_SAMPLES = int(os.environ.get("SANAD_FACES_MAX_SAMPLES", "3"))
# Size argument handed to the gallery's _resize_for_primer() — presumably a
# target edge length in pixels; confirm in the gallery implementation.
_FACES_PRIMER_RESIZE = int(os.environ.get("SANAD_FACES_PRIMER_RESIZE", "256"))
# N3 — zones gallery (zone → place → linked faces). Folded into a Gemini
# primer turn so Gemini can recognise / talk about known locations and the
# people associated with them.
_ZONES_DIR = Path(os.environ.get(
    "SANAD_ZONES_DIR",
    str(BASE_DIR / "data" / "zones"),
))


# ── stdin push channel (Marcus pattern) ──────────────────────
# The GeminiSubprocess supervisor writes three line types to this process's
# stdin:
#   "frame:<base64-jpeg>\n"   — a camera frame to relay to Gemini Live
#   "state:<json>\n"          — a motion-state update to inject as text
#   "profile:<json>\n"        — an audio-profile hot-swap request
# A daemon thread (_stdin_watcher) parses them into the caches below; the
# asyncio tasks _send_frame_loop / _send_state_loop / _audio_swap_loop drain
# those caches. Each cache has its own lock because it is written from the
# watcher thread.

# Most recent camera frame only: "bytes" holds the decoded payload and "ts"
# the time.time() at which it was received. Older frames are overwritten.
_LATEST_FRAME_LOCK = threading.Lock()
_LATEST_FRAME: dict = {"bytes": None, "ts": 0.0}

# Queue of ready-to-send state messages, appended in arrival order.
_STATE_LOCK = threading.Lock()
_STATE_PENDING: list[str] = []

# Maps the "event" field of a state payload to the tag that prefixes the
# text injected into the session. Events not listed here are dropped.
_STATE_TAGS = {
    "start":       "[STATE-START]",
    "complete":    "[STATE-DONE]",
    "interrupted": "[STATE-INTERRUPTED]",
    "error":       "[STATE-ERROR]",
    "paused":      "[STATE-PAUSED]",
    "resumed":     "[STATE-RESUMED]",
}

# Pending audio-profile swap signalled by the parent over "profile:" stdin
# lines. _audio_swap_loop drains it inside the brain's asyncio loop.
# Only the latest request is kept: a newer "profile:" line overwrites it.
_PROFILE_LOCK = threading.Lock()
_PROFILE_PENDING: dict = {"id": None, "reason": ""}

# Profile ids accepted from the parent; any other id is ignored.
_VALID_PROFILES = {"builtin", "anker", "hollyland_builtin"}


def _json_object(raw: str) -> Optional[dict]:
    """Parse *raw* as JSON and return it only when it is a JSON object."""
    try:
        payload = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return payload if isinstance(payload, dict) else None


def _text_field(payload: dict, key: str) -> str:
    """Return ``payload[key]`` stripped when it is a string, otherwise ''."""
    value = payload.get(key)
    return value.strip() if isinstance(value, str) else ""


def _handle_frame_line(b64: str) -> None:
    """Decode a base64 frame and publish it as the latest camera frame."""
    try:
        data = base64.b64decode(b64)
    except ValueError:  # binascii.Error is a ValueError subclass
        return
    if data:
        with _LATEST_FRAME_LOCK:
            _LATEST_FRAME["bytes"] = data
            _LATEST_FRAME["ts"] = time.time()


def _handle_state_line(raw: str) -> None:
    """Turn a motion-state JSON payload into a tagged text message and queue it."""
    payload = _json_object(raw)
    if payload is None:
        return
    event = _text_field(payload, "event").lower()
    cmd = _text_field(payload, "cmd")
    tag = _STATE_TAGS.get(event)
    if not tag or not cmd:
        return
    msg = f"{tag} {cmd}"
    elapsed = payload.get("elapsed_sec")
    if isinstance(elapsed, (int, float)):
        msg += f" ({float(elapsed):.1f}s)"
    reason = payload.get("reason")
    if reason and event == "error":
        msg += f" — {reason}"
    with _STATE_LOCK:
        _STATE_PENDING.append(msg)


def _handle_profile_line(raw: str) -> None:
    """Stash a requested audio-profile swap for _audio_swap_loop to perform."""
    payload = _json_object(raw)
    if payload is None:
        return
    pid = _text_field(payload, "id").lower()
    if pid not in _VALID_PROFILES:
        return
    # Only the target is recorded here; the swap itself runs in the asyncio
    # loop so PyAudio open/close happens off the stdin thread.
    with _PROFILE_LOCK:
        _PROFILE_PENDING["id"] = pid
        _PROFILE_PENDING["reason"] = _text_field(payload, "reason")


def _stdin_watcher() -> None:
    """Daemon thread — parse 'frame:' / 'state:' / 'profile:' lines off stdin.

    Best-effort: any malformed line is skipped. That includes payloads that
    are valid JSON but not an object, and fields that are not strings —
    previously either of those raised inside the loop and silently ended the
    thread, so no further frames, states or profile swaps were ever seen.
    Lines with an unknown prefix are ignored.

    Exits when the parent closes our stdin (subprocess teardown) or when
    reading stdin itself fails.
    """
    handlers = (
        ("frame:", _handle_frame_line),
        ("state:", _handle_state_line),
        ("profile:", _handle_profile_line),
    )
    try:
        for line in sys.stdin:
            line = line.rstrip("\n")
            if not line:
                continue
            for prefix, handler in handlers:
                if not line.startswith(prefix):
                    continue
                try:
                    handler(line[len(prefix):])
                except Exception as exc:
                    # One bad line must never take the whole channel down.
                    log.warning("stdin %s line skipped: %s", prefix, exc)
                break
    except Exception as exc:
        log.warning("stdin watcher stopped: %s", exc)


# Start the watcher at import time — it blocks harmlessly on sys.stdin
# until the supervisor sends something. Daemon so it never blocks exit.
# NOTE: this makes importing the module a side effect — from this point on
# the thread consumes every line written to the process's stdin.
threading.Thread(target=_stdin_watcher, daemon=True, name="stdin-watcher").start()


def _audio_energy(pcm: bytes) -> int:
    """Return the mean absolute amplitude of a native-endian int16 PCM buffer.

    Called for every mic chunk from the event-loop thread, so the reduction
    is done in NumPy rather than with a per-sample Python loop.

    Args:
        pcm: raw 16-bit signed PCM bytes.

    Returns:
        ``sum(|sample|) // sample_count`` as a plain ``int``; 0 for an empty
        buffer or one that cannot be read as int16 (e.g. odd length).
    """
    try:
        samples = np.frombuffer(pcm, dtype=np.int16)
    except (TypeError, ValueError, AttributeError, BufferError):
        return 0
    if samples.size == 0:
        return 0
    # Widen before abs(): |-32768| does not fit in int16.
    magnitudes = np.abs(samples.astype(np.int32))
    return int(magnitudes.sum(dtype=np.int64) // samples.size)


class GeminiBrain:
    """Gemini Live conversation brain — reconnect-safe."""

    def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
                 system_prompt: str = ""):
        """Bind the brain to its audio endpoints, recorder and feature flags.

        Args:
            audio_io: audio endpoint bundle; must expose `mic` and `speaker`
                and may expose `_audio_client` and `profile_id`.
            recorder: per-turn capture sink.
            voice_name: prebuilt Gemini voice; falls back to `GEMINI_VOICE`.
            system_prompt: text sent as the session's system instruction.
        """
        def env_is_on(var_name: str) -> bool:
            # A feature env var only counts when it is exactly "1".
            return os.environ.get(var_name, "0") == "1"

        # ── audio endpoints ──
        self._audio = audio_io
        self._mic = audio_io.mic
        self._speaker = audio_io.speaker
        # Held so swap_audio_devices() can rebuild the profiles that need DDS
        # (`builtin`, `hollyland_builtin`) without a full re-init.
        self._audio_client = getattr(audio_io, "_audio_client", None)
        # Active profile id: whatever audio_io reports, otherwise the
        # SANAD_AUDIO_PROFILE env value (normalised), otherwise "builtin".
        # Later changes arrive through the parent's "profile:" stdin push.
        profile_id = getattr(audio_io, "profile_id", None)
        if not profile_id:
            profile_id = os.environ.get(
                "SANAD_AUDIO_PROFILE", "builtin").strip().lower()
        self._current_profile_id = profile_id
        # Serialises profile swaps so two requests cannot interleave during
        # tear-down. Created lazily in run(), on the live event loop.
        self._swap_lock: Optional[asyncio.Lock] = None

        # ── session plumbing ──
        self._recorder = recorder
        self._voice = voice_name if voice_name else GEMINI_VOICE
        self._system_prompt = system_prompt
        self._api_key = GEMINI_API_KEY
        self._stop_flag = asyncio.Event()

        # ── per-session turn state (cleared again before every session) ──
        self._speaking = False
        self._stream_started = False
        self._barge_block_until = 0.0
        self._ai_speak_start = 0.0
        self._last_ai_audio = 0.0
        self._done: Optional[asyncio.Event] = None

        # ── feature flags ──
        # Boot values come from the recognition state file; each SANAD_*
        # env var can additionally force its flag on. The state watcher
        # keeps them in sync afterwards.
        boot_state = _recog_state.read(_RECOG_STATE_PATH)
        self._vision_enabled = (
            bool(boot_state.vision_enabled)
            or env_is_on("SANAD_VISION_ENABLE")
        )
        self._face_rec_enabled = (
            bool(boot_state.face_rec_enabled)
            or env_is_on("SANAD_FACE_RECOGNITION_ENABLE")
        )
        # -1 means "no gallery primer sent yet in this session".
        self._gallery_version_primed = -1
        # N3 — zones knowledge toggle and its primer version.
        self._zone_rec_enabled = (
            bool(boot_state.zone_rec_enabled)
            or env_is_on("SANAD_ZONE_RECOGNITION_ENABLE")
        )
        self._zones_version_primed = -1
        # "Go here" destination already announced, as (zone_id, place_id).
        target_zone = int(boot_state.nav_target_zone_id)
        target_place = int(boot_state.nav_target_place_id)
        self._nav_target = (target_zone, target_place)
        # N2 — locomotion gate. Only announced here; the dispatch loop that
        # actually moves the robot lives in the parent process.
        self._movement_enabled = (
            bool(boot_state.movement_enabled)
            or env_is_on("SANAD_MOVEMENT_ENABLE")
        )

    def stop(self) -> None:
        """Signal the run loop to exit at the next opportunity."""
        try:
            self._stop_flag.set()
        except Exception:
            pass

    # ─── public entry point ───────────────────────────────

    async def run(self) -> None:
        """Run Gemini Live sessions back-to-back until stopped.

        Each pass of the outer loop opens one Live session and runs the six
        per-session loops (mic send, receive, frame send, state send,
        recognition-state watcher, audio swap) together, bounded by
        `_SESSION_TIMEOUT`. When a session ends the brain waits 1 s and
        reconnects. A failed session backs off exponentially (capped at
        `_MAX_RECONNECT_DELAY`); after `_MAX_CONSECUTIVE_ERRORS` failures in a
        row the genai client itself is rebuilt.

        The loop exits once `stop()` has been called (checked between
        sessions) or on cancellation / KeyboardInterrupt. Cancellation is
        absorbed by the outer handler, so this coroutine returns normally.
        """
        client = genai.Client(api_key=self._api_key)
        config = self._build_config()
        session_num = 0
        start_time = time.time()
        consecutive_errors = 0

        while not self._stop_flag.is_set():
            session_num += 1
            self._reset_turn_state()
            uptime_min = (time.time() - start_time) / 60

            try:
                log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
                         session_num, uptime_min)
                async with client.aio.live.connect(model=_MODEL, config=config) as session:
                    log.info("connected — speak anytime!")
                    consecutive_errors = 0
                    self._mic.flush()
                    self._done = asyncio.Event()
                    # Reset per-session primer state so re-priming on reconnect
                    # actually happens. The state watcher will re-prime as soon
                    # as it sees vision+face-rec (and zone-rec) enabled.
                    self._gallery_version_primed = -1
                    self._zones_version_primed = -1
                    # Re-announce the active destination on reconnect.
                    self._nav_target = (-1, -1)
                    # Lazy-build the swap lock on the active asyncio loop.
                    if self._swap_lock is None:
                        self._swap_lock = asyncio.Lock()

                    try:
                        await asyncio.wait_for(
                            asyncio.gather(
                                self._send_mic_loop(session),
                                self._receive_loop(session),
                                self._send_frame_loop(session),
                                self._send_state_loop(session),
                                self._recognition_state_watcher(session),
                                self._audio_swap_loop(session),
                            ),
                            timeout=_SESSION_TIMEOUT,
                        )
                    except asyncio.TimeoutError:
                        log.warning("session timed out after %ds", _SESSION_TIMEOUT)
                    except asyncio.CancelledError:
                        log.warning("session cancelled")
                        # If run() itself was asked to cancel, honour it:
                        # swallowing the request here would make the brain
                        # sleep and reconnect instead of shutting down.
                        # Task.cancelling() only exists on Python 3.11+; where
                        # it is missing we cannot tell a real cancel from one
                        # surfaced by a child loop, so the previous behaviour
                        # (reconnect) is kept.
                        this_task = asyncio.current_task()
                        cancel_requests = getattr(this_task, "cancelling", None)
                        if cancel_requests is not None and cancel_requests():
                            raise
                    finally:
                        # gather() does not stop the sibling loops when one of
                        # them fails or is cancelled; flag the session as over
                        # so the survivors exit on their next check instead of
                        # lingering into the next session.
                        self._done.set()

                    log.info("session #%d ended — reconnecting in 1s", session_num)
                    self._speaker.stop()
                    self._mic.flush()
                    await asyncio.sleep(1)

            except asyncio.CancelledError:
                log.info("cancelled — stopping")
                break
            except KeyboardInterrupt:
                log.info("keyboard interrupt — stopping")
                break
            except Exception as exc:
                consecutive_errors += 1
                # Exponential backoff: 2, 4, 8, ... seconds up to the cap.
                delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
                log.error("session error (#%d): %s — reconnecting in %ds",
                          consecutive_errors, exc, delay)
                await asyncio.sleep(delay)
                if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
                    log.warning("%d consecutive errors — recreating client",
                                consecutive_errors)
                    try:
                        client = genai.Client(api_key=self._api_key)
                        consecutive_errors = 0
                    except Exception as ce:
                        log.error("client recreation failed: %s", ce)

    # ─── Gemini config ────────────────────────────────────

    def _build_config(self) -> types.LiveConnectConfig:
        """Assemble the Live connect config for an audio-only conversation.

        Combines the prebuilt voice, automatic activity detection tuned from
        the `vad` config section, input and output transcription, and the
        system prompt as the system instruction.
        """
        voice = types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                voice_name=self._voice,
            ),
        )

        # Sensitivities are configured by enum member *name*.
        start_level = getattr(
            types.StartSensitivity,
            _VAD.get("start_sensitivity", "START_SENSITIVITY_HIGH"),
        )
        end_level = getattr(
            types.EndSensitivity,
            _VAD.get("end_sensitivity", "END_SENSITIVITY_LOW"),
        )
        activity_detection = types.AutomaticActivityDetection(
            disabled=False,
            start_of_speech_sensitivity=start_level,
            end_of_speech_sensitivity=end_level,
            prefix_padding_ms=_VAD.get("prefix_padding_ms", 20),
            silence_duration_ms=_VAD.get("silence_duration_ms", 200),
        )
        realtime_input = types.RealtimeInputConfig(
            automatic_activity_detection=activity_detection,
        )

        user_transcription = types.AudioTranscriptionConfig()
        robot_transcription = types.AudioTranscriptionConfig()
        instruction = types.Content(
            parts=[types.Part(text=self._system_prompt)],
        )

        return types.LiveConnectConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(voice_config=voice),
            realtime_input_config=realtime_input,
            input_audio_transcription=user_transcription,
            output_audio_transcription=robot_transcription,
            system_instruction=instruction,
        )

    # ─── state helpers ────────────────────────────────────

    def _reset_turn_state(self) -> None:
        self._speaking = False
        self._stream_started = False
        self._barge_block_until = 0.0
        self._ai_speak_start = 0.0
        self._last_ai_audio = 0.0

    def _interrupt(self, source: str = "local") -> None:
        """Cut the AI's current utterance short and close the turn.

        Clears the speaking flags, stops the speaker, flushes the mic,
        finishes the recorder's turn and logs who triggered it.

        Args:
            source: label for the log line (e.g. "barge-in", "gemini").
        """
        self._stream_started = False
        self._speaking = False

        speaker, mic, recorder = self._speaker, self._mic, self._recorder
        speaker.stop()
        mic.flush()
        recorder.finish_turn()

        log.info("interrupt (%s)", source)

    # ─── mic send loop ────────────────────────────────────

    async def _send_mic_loop(self, session: Any) -> None:
        """Stream microphone audio to Gemini until the session ends.

        Each iteration reads one chunk from the mic in an executor thread,
        applies `_MIC_GAIN`, and then:

        * barge-in — while the AI is speaking, sustained loud chunks call
          `_interrupt("barge-in")` and start a cooldown;
        * echo suppression — while the AI is speaking, quiet chunks are
          replaced with silence before sending;
        * recording — loud chunks heard while the AI is silent are handed to
          the recorder as user audio;
        * send — the (possibly masked) chunk goes out via
          `send_realtime_input`.

        A failed send sets `self._done`, ending the session. A failed mic
        read ends only this loop; it is now logged instead of being dropped
        silently, so the cause is visible.

        Args:
            session: the connected Live session.
        """
        threshold = _BI.get("threshold", 500)
        chunks_needed = _BI.get("loud_chunks_needed", 3)
        cooldown = _BI.get("cooldown_sec", 0.3)
        echo_suppress_below = _BI.get("echo_suppress_below", 500)
        grace = _BI.get("ai_speak_grace_sec", 0.15)

        # Always called from inside the running loop.
        loop = asyncio.get_running_loop()
        loud_count = 0
        last_activity = time.time()

        while not self._done.is_set() and not self._stop_flag.is_set():
            try:
                raw = await loop.run_in_executor(
                    None, self._mic.read_chunk, _CHUNK_BYTES,
                )
            except Exception as exc:
                # NOTE(review): unlike a failed send, this does not set
                # self._done, so the session carries on without mic input.
                # Confirm whether ending the session would be preferable.
                log.warning("mic read failed: %s — stopping mic loop", exc)
                break

            # Apply gain in float, clamp to the int16 range, convert back.
            samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
            samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
            data = samples.tobytes()
            energy = _audio_energy(data)
            now = time.time()

            # Barge-in: after AI starts speaking, sustained user energy cuts it.
            # loud_count rises on loud chunks and decays on quiet ones.
            # NOTE(review): the test is "> chunks_needed", i.e. one more loud
            # chunk than the configured number — confirm that is intended.
            if self._speaking and now >= self._barge_block_until:
                if (now - self._ai_speak_start) >= grace:
                    if energy > threshold:
                        loud_count += 1
                    else:
                        loud_count = max(0, loud_count - 1)
                    if loud_count > chunks_needed:
                        log.info("BARGE-IN (e=%d)", energy)
                        self._interrupt("barge-in")
                        loud_count = 0
                        self._barge_block_until = now + cooldown

            # Echo suppression: while AI is speaking, mask quiet frames so the
            # mic doesn't feed the model its own voice bleed.
            send_data = data
            if self._speaking and energy < echo_suppress_below:
                send_data = _SILENCE_PCM

            # Record user audio when clearly speaking and AI isn't.
            if energy > 250 and not self._speaking:
                self._recorder.capture_user(data)

            # Keep-alive watchdog: log a heartbeat after 10 s without speech.
            if energy > 250:
                last_activity = now
            elif now - last_activity > 10:
                log.info("alive (no speech %.0fs, e=%d)",
                         now - last_activity, energy)
                last_activity = now

            try:
                await session.send_realtime_input(
                    audio=types.Blob(
                        data=send_data,
                        mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}",
                    ),
                )
            except asyncio.CancelledError:
                return
            except Exception as exc:
                log.warning("mic send failed: %s — ending session", exc)
                self._done.set()
                return

            # Pace the loop by one chunk's duration.
            await asyncio.sleep(CHUNK_SIZE / SEND_SAMPLE_RATE)

        log.info("send_mic task ended")

    # ─── receive loop ─────────────────────────────────────

    async def _receive_loop(self, session: Any) -> None:
        """Consume server messages and drive playback, transcripts and turns.

        For each response from `session.receive()`:

        * `go_away` — marks the session done and returns (run() reconnects);
        * `interrupted` — calls `_interrupt("gemini")` if the AI was speaking;
        * input transcription — logged as "USER:" and recorded, but only
          while the AI is not speaking;
        * output transcription — logged as "BOT:" and recorded;
        * model audio — captured by the recorder and streamed to the speaker
          from an executor thread;
        * `turn_complete` — waits for the speaker to finish (unless it was
          interrupted), clears the speaking flags, flushes the mic and closes
          the recorder's turn.

        `self._done` is always set on exit, whatever the reason.

        Args:
            session: the connected Live session.
        """
        loop = asyncio.get_event_loop()
        try:
            last_recv = time.time()
            while not self._done.is_set() and not self._stop_flag.is_set():
                async for response in session.receive():
                    last_recv = time.time()
                    if self._done.is_set():
                        break

                    # Server announced it is about to drop the connection.
                    if hasattr(response, "go_away") and response.go_away is not None:
                        log.info("server going away — will reconnect")
                        self._done.set()
                        return

                    sc = response.server_content
                    if sc is None:
                        continue

                    # Server-side interruption; the rest of this message is
                    # skipped.
                    if sc.interrupted is True:
                        if self._speaking:
                            log.info("Gemini interrupted")
                            self._interrupt("gemini")
                        continue

                    if sc.input_transcription:
                        text = (sc.input_transcription.text or "").strip()
                        # Ignored while the AI is speaking — presumably to
                        # keep the AI's own echo out of the user transcript.
                        if text and not self._speaking:
                            log.info("USER: %s", text)
                            self._recorder.add_user_text(text)

                    if sc.output_transcription:
                        text = (sc.output_transcription.text or "").strip()
                        if text:
                            # Emit as "BOT:" (no space before colon) so the
                            # supervisor's _track_line can parse it the same
                            # way it parses "USER:" — this is the channel the
                            # movement dispatcher (N2) reads Gemini's own
                            # spoken phrases from. Keep in lock-step with
                            # GeminiSubprocess._track_line.
                            log.info("BOT: %s", text)
                            self._recorder.add_robot_text(text)

                    if sc.model_turn:
                        for part in sc.model_turn.parts:
                            if part.inline_data and part.inline_data.data:
                                now = time.time()
                                # First audio of a turn: stamp the start time
                                # that the barge-in grace period counts from.
                                if not self._speaking:
                                    self._ai_speak_start = now
                                    self._speaking = True
                                self._last_ai_audio = now
                                raw_audio = part.inline_data.data
                                self._recorder.capture_robot(raw_audio)
                                audio = np.frombuffer(raw_audio, dtype=np.int16)
                                # Open the speaker stream once per turn; the
                                # speaker calls run in the default executor.
                                if not self._stream_started:
                                    await loop.run_in_executor(
                                        None, self._speaker.begin_stream,
                                    )
                                    self._stream_started = True
                                await loop.run_in_executor(
                                    None, self._speaker.send_chunk,
                                    audio, RECEIVE_SAMPLE_RATE,
                                )

                    if sc.turn_complete:
                        # Let queued audio play out, unless playback was cut.
                        if (self._speaking and self._stream_started
                                and not self._speaker.interrupted):
                            log.info("speaker %.1fs", self._speaker.total_sent_sec)
                            await loop.run_in_executor(
                                None, self._speaker.wait_finish,
                            )
                        elif self._speaking and self._speaker.interrupted:
                            log.info("speaker interrupted")
                        self._speaking = False
                        self._stream_started = False
                        self._mic.flush()
                        self._recorder.finish_turn()
                        log.info("listening")

                # NOTE(review): this check only runs after the receive()
                # iterator above has finished, so a stream that stalls while
                # being iterated is presumably not caught here — confirm.
                if time.time() - last_recv > _NO_MESSAGES_TIMEOUT:
                    log.warning("no messages from Gemini for %ds — session dead",
                                _NO_MESSAGES_TIMEOUT)
                    break
                await asyncio.sleep(0.1)

        except Exception as exc:
            log.warning("receive ended: %s", exc)
        finally:
            # Signals the other per-session loops that the session is over.
            self._done.set()

    # ─── vision-state announcer ───────────────────────────
    # Injects the camera state into the live session as text context.
    # On a live toggle Gemini is told to say so out loud ("I can see you
    # now" / "I can't see you anymore"); at session start it's silent
    # standing context so "can you see me?" is answered honestly.

    async def _announce_vision_state(self, session: Any, enabled: bool,
                                     is_toggle: bool) -> None:
        """Tell Gemini whether the camera is on, as injected text context.

        Args:
            session: the connected Live session.
            enabled: current camera state.
            is_toggle: True for a live change (Gemini is asked to say so out
                loud), False for silent standing context at session start.

        Send failures are logged and swallowed; cancellation propagates.
        """
        # Keyed by (is_toggle, enabled).
        messages = {
            (True, True): (
                "[VISION ON] Your camera was just enabled — you can now see "
                "the user through it. Briefly tell them you can see them now, "
                "in your normal Khaleeji style (for example: "
                "'هلا، الحين أشوفك زين')."
            ),
            (True, False): (
                "[VISION OFF] Your camera was just disabled — you can no "
                "longer see anything. Briefly tell the user you can't see "
                "them anymore. If they later ask whether you can see them, "
                "tell them to enable the camera from the dashboard."
            ),
            (False, True): (
                "[VISION STATUS] Your camera is ON — you can see the user "
                "through it. Do not announce this unprompted; just answer "
                "naturally if they ask what you see."
            ),
            (False, False): (
                "[VISION STATUS] Your camera is OFF — you cannot see anything "
                "right now. If the user asks whether you can see them, tell "
                "them to enable the camera from the dashboard. Do not announce "
                "this unprompted."
            ),
        }
        text = messages[(bool(is_toggle), bool(enabled))]
        try:
            await session.send_realtime_input(text=text)
            log.info("vision-state injected (enabled=%s, toggle=%s)",
                     enabled, is_toggle)
        except asyncio.CancelledError:
            raise
        except Exception as err:
            log.warning("vision-state inject failed: %s", err)

    # ─── face-recognition-state announcer ─────────────────
    # Same idea as _announce_vision_state, for the face-recognition toggle.
    # On a live OFF toggle it also tells Gemini to disregard the gallery —
    # so OFF takes effect immediately instead of lingering until reconnect.

    async def _announce_facerec_state(self, session: Any, enabled: bool,
                                      is_toggle: bool) -> None:
        """Tell Gemini whether face recognition is on, as injected text.

        Args:
            session: the connected Live session.
            enabled: current face-recognition state.
            is_toggle: True for a live change (spoken acknowledgement is
                requested, and on OFF the gallery is to be disregarded),
                False for silent standing context at session start.

        Send failures are logged and swallowed; cancellation propagates.
        """
        if is_toggle:
            if enabled:
                text = (
                    "[FACE RECOGNITION ON] Face recognition was just enabled — "
                    "you'll be shown the people you know in a moment. Briefly "
                    "tell the user you can now recognise the people you know, in "
                    "your normal Khaleeji style."
                )
            else:
                text = (
                    "[FACE RECOGNITION OFF] Face recognition was just disabled. "
                    "Disregard the face gallery you were given earlier — stop "
                    "greeting people by name and do not identify anyone. Briefly "
                    "tell the user you'll no longer recognise faces."
                )
        else:
            # Session start: standing context only, nothing to say aloud.
            if enabled:
                text = (
                    "[FACE RECOGNITION STATUS] Face recognition is ON — when you "
                    "see someone you've been shown in the gallery, greet them by "
                    "name. Do not announce this unprompted."
                )
            else:
                text = (
                    "[FACE RECOGNITION STATUS] Face recognition is OFF — you "
                    "cannot identify people. If the user asks who someone is or "
                    "whether you recognise them, tell them to enable face "
                    "recognition from the dashboard. Do not announce this "
                    "unprompted."
                )
        try:
            await session.send_realtime_input(text=text)
            log.info("face-rec-state injected (enabled=%s, toggle=%s)",
                     enabled, is_toggle)
        except asyncio.CancelledError:
            raise
        except Exception as err:
            log.warning("face-rec-state inject failed: %s", err)

    # ─── zone-recognition-state announcer (N3) ────────────
    # Same idea as _announce_facerec_state, for the zones-gallery toggle.
    # On a live OFF toggle it also tells Gemini to disregard the zones and
    # places it was given, so OFF takes effect immediately instead of lingering.

    async def _announce_zonerec_state(self, session: Any, enabled: bool,
                                      is_toggle: bool) -> None:
        """Tell Gemini whether zone recognition is on, as injected text.

        Args:
            session: the connected Live session.
            enabled: current zone-recognition state.
            is_toggle: True for a live change (spoken acknowledgement is
                requested, and on OFF the zones are to be disregarded),
                False for silent standing context at session start.

        Send failures are logged and swallowed; cancellation propagates.
        """
        toggled_on = (
            "[ZONE RECOGNITION ON] You were just given the zones and places "
            "you know (and the people associated with them). Briefly tell "
            "the user you now know your way around, in your normal Khaleeji "
            "style."
        )
        toggled_off = (
            "[ZONE RECOGNITION OFF] Zone recognition was just disabled. "
            "Disregard the zones and places you were given earlier — stop "
            "naming rooms or locations. Briefly tell the user you'll no "
            "longer recognise places."
        )
        status_on = (
            "[ZONE RECOGNITION STATUS] Zone recognition is ON — when you see "
            "or are asked about a zone/place you've been told about, you may "
            "name it and use its description. Do not announce this "
            "unprompted."
        )
        status_off = (
            "[ZONE RECOGNITION STATUS] Zone recognition is OFF — you do not "
            "know any specific zones or places. If the user asks where they "
            "are or to go somewhere by name, tell them to enable zone "
            "recognition from the dashboard. Do not announce this "
            "unprompted."
        )
        if is_toggle:
            text = toggled_on if enabled else toggled_off
        else:
            text = status_on if enabled else status_off

        try:
            await session.send_realtime_input(text=text)
            log.info("zone-rec-state injected (enabled=%s, toggle=%s)",
                     enabled, is_toggle)
        except asyncio.CancelledError:
            raise
        except Exception as err:
            log.warning("zone-rec-state inject failed: %s", err)

    # ─── navigation-target announcer (N3 "go here") ───────
    # When the operator sets a destination, tell Gemini which place to go to
    # and show it the reference photo(s). Actual robot motion is wired by N2;
    # this establishes the goal + visual reference.

    async def _announce_nav_target(self, session: Any,
                                   zone_id: int, place_id: int) -> None:
        """Tell Gemini the current "go here" destination, with photos.

        A falsy `zone_id` or `place_id` means the destination was cleared and
        only a short text note is sent. Otherwise the place is looked up in
        the zone gallery and sent as one user turn: an instruction followed
        by up to `_FACES_MAX_SAMPLES` reference images.

        Lookup and send failures are logged and swallowed; a place that is
        not in the gallery is skipped. Cancellation propagates.

        Args:
            session: the connected Live session.
            zone_id: id of the zone holding the destination.
            place_id: id of the destination place inside that zone.
        """
        if not (zone_id and place_id):
            cleared_note = (
                "[DESTINATION CLEARED] You have no specific destination right "
                "now. Do not announce this unprompted."
            )
            try:
                await session.send_realtime_input(text=cleared_note)
            except asyncio.CancelledError:
                raise
            except Exception as err:
                log.warning("nav-clear inject failed: %s", err)
            return

        # Resolve the destination; the gallery is imported on demand.
        try:
            from Project.Sanad.vision.zone_gallery import ZoneGallery
            zones = ZoneGallery(_ZONES_DIR)
            target_place = zones.get_place(zone_id, place_id)
            target_zone = zones.get_zone(zone_id)
        except Exception as err:
            log.warning("nav-target resolve failed: %s", err)
            return
        if target_place is None:
            log.info("nav-target zone_%d/place_%d not found — skipping", zone_id, place_id)
            return

        # Fall back to generic labels when the gallery has no names.
        place_name = target_place.name or f"place {place_id}"
        zone_name = (target_zone.name if target_zone else None) or f"zone {zone_id}"

        pieces = [
            f"[GO HERE] The user has set your destination to '{place_name}' in "
            f"'{zone_name}'."
        ]
        if target_place.description:
            pieces.append(f" Notes: {target_place.description}.")
        pieces.append(
            " The image(s) below show what it looks like so you can recognise it. "
            "If walking is enabled you will head there; if it is off, tell the "
            "user to enable movement from the dashboard. Briefly acknowledge the "
            "destination in your normal Khaleeji style."
        )

        turn_parts: list[dict[str, Any]] = [{"text": "".join(pieces)}]
        for sample in target_place.sample_paths[:_FACES_MAX_SAMPLES]:
            try:
                original = sample.read_bytes()
            except OSError:
                continue  # unreadable sample: send the others
            # Use the resized image when the gallery returns one, otherwise
            # the original bytes.
            resized = zones._resize_for_primer(original, _FACES_PRIMER_RESIZE)
            turn_parts.append({
                "inline_data": {
                    "mime_type": "image/jpeg",
                    "data": resized or original,
                },
            })

        try:
            await session.send_client_content(
                turns=[{"role": "user", "parts": turn_parts}], turn_complete=True,
            )
            log.info("nav-target injected → zone_%d/place_%d (%s)",
                     zone_id, place_id, place_name)
        except asyncio.CancelledError:
            raise
        except Exception as err:
            log.warning("nav-target inject failed: %s", err)

    # ─── movement-state announcer (N2) ────────────────────
    # Spoken confirmation when the operator enables / disables Gemini-driven
    # locomotion from the dashboard. The actual movement dispatch loop lives
    # in the parent; this only gives the user audible feedback on the toggle.

    async def _announce_movement_state(self, session: Any, enabled: bool,
                                       is_toggle: bool) -> None:
        """Inject a text note telling Gemini whether walking is on or off.

        Args:
            session: Live session; the note goes out through
                ``send_realtime_input(text=...)``.
            enabled: Current movement flag.
            is_toggle: True when the flag was just flipped (the note asks
                Gemini to tell the user); False for the standing status
                note, which asks Gemini not to announce it unprompted.

        Raises:
            asyncio.CancelledError: re-raised untouched. Any other send
                failure is logged as a warning and swallowed.
        """
        # Keyed by (is_toggle, enabled) so each of the four cases has exactly
        # one wording.
        notes = {
            (True, True): (
                "[MOVEMENT ON] Walking is now enabled — you can move when the "
                "user asks. Briefly tell the user movement is enabled and they "
                "can ask you to walk, in your normal Khaleeji style."
            ),
            (True, False): (
                "[MOVEMENT OFF] Walking was just disabled — you must not move. "
                "Briefly tell the user movement is now off. If they ask you to "
                "walk, tell them to enable movement from the dashboard first."
            ),
            # Status notes (not a toggle): movement already on ...
            (False, True): (
                "[MOVEMENT STATUS] Walking is ON — you may move when asked. Do "
                "not announce this unprompted."
            ),
            # ... or already off.
            (False, False): (
                "[MOVEMENT STATUS] Walking is OFF — you cannot move right now. "
                "If the user asks you to walk, tell them to enable movement "
                "from the dashboard. Do not announce this unprompted."
            ),
        }
        note = notes[(bool(is_toggle), bool(enabled))]
        try:
            await session.send_realtime_input(text=note)
            log.info("movement-state injected (enabled=%s, toggle=%s)",
                     enabled, is_toggle)
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            log.warning("movement-state inject failed: %s", exc)

    # ─── audio profile hot-swap ───────────────────────────
    # The parent (GeminiSubprocess) polls pactl for the Anker USB device
    # and writes "profile:<json>" lines to our stdin. _stdin_watcher parses
    # them into _PROFILE_PENDING; this loop drains the flag on the asyncio
    # loop and performs the actual swap. The brain's read/write sites
    # (_send_mic_loop / _receive_loop) keep using self._mic / self._speaker —
    # an atomic ref reassignment is enough because nothing caches them in
    # a loop-local variable (verified in exploration).

    async def _audio_swap_loop(self, session: Any) -> None:
        """Apply pending audio-profile requests until the brain stops.

        Every 0.25 s the pending request in ``_PROFILE_PENDING`` is taken and
        cleared under ``_PROFILE_LOCK``; a request naming a profile other
        than the current one is handed to ``swap_audio_devices``.

        Args:
            session: Live session, forwarded to ``swap_audio_devices``.
        """
        while True:
            if self._done.is_set() or self._stop_flag.is_set():
                break
            await asyncio.sleep(0.25)
            # Read and reset in one critical section so a request is consumed
            # exactly once.
            with _PROFILE_LOCK:
                wanted = _PROFILE_PENDING.get("id")
                why = _PROFILE_PENDING.get("reason", "")
                _PROFILE_PENDING["id"] = None
                _PROFILE_PENDING["reason"] = ""
            if wanted and wanted != self._current_profile_id:
                try:
                    await self.swap_audio_devices(session, wanted, reason=why)
                except asyncio.CancelledError:
                    return
                except Exception as exc:
                    log.warning("audio swap failed: %s", exc)

    async def swap_audio_devices(self, session: Any, profile_id: str,
                                 reason: str = "") -> None:
        """Hot-swap mic+speaker to `profile_id` without dropping the live
        Gemini session. Idempotent (no-op if already on `profile_id`).

        Order matters: start the new mic BEFORE we tear the old one down,
        so a transient PyAudio failure (e.g. udev hasn't exposed Anker yet)
        leaves the old backend in place. The new backend is attempted up to
        3 times with a fixed 0.4 s pause between attempts; if every attempt
        fails we log WARN and keep the current profile.

        Args:
            session: Live session; receives a silent "[AUDIO SWITCH]" text
                note after a successful swap.
            profile_id: Audio profile to switch to.
            reason: Free-text reason, used only in the log line.

        Raises:
            asyncio.CancelledError: re-raised if the final announce is
                cancelled. Other announce failures are logged and swallowed.
        """
        if self._swap_lock is None:
            log.warning("swap requested before session loop started — skipping")
            return
        async with self._swap_lock:
            if profile_id == self._current_profile_id:
                return
            prev = self._current_profile_id
            log.info("audio swap: %s → %s (reason=%s)", prev, profile_id, reason or "—")

            # Build + start the new mic. Retry: pactl can see the device
            # before PyAudio's get_device_count refreshes.
            try:
                from Project.Sanad.voice.audio_io import AudioIO as _AudioIO
            except Exception as exc:
                log.error("audio swap: AudioIO import failed: %s", exc)
                return
            max_attempts = 3
            retry_delay_s = 0.4
            new_mic = new_spk = None
            last_exc: Optional[BaseException] = None
            for attempt in range(1, max_attempts + 1):
                try:
                    new_mic, new_spk = _AudioIO.build_backends(
                        profile_id, audio_client=self._audio_client,
                    )
                    # mic.start() opens PyAudio + spawns reader thread.
                    # speaker is lazy (opens on first send_chunk).
                    await asyncio.to_thread(new_mic.start)
                    break
                except Exception as exc:
                    last_exc = exc
                    # Tear down a partially-built backend so the next attempt
                    # gets a clean slate; don't leak PyAudio handles.
                    if new_mic is not None:
                        try:
                            await asyncio.to_thread(new_mic.stop)
                        except Exception:
                            pass  # best-effort cleanup of a failed backend
                    new_mic = new_spk = None
                    if attempt < max_attempts:
                        log.info("audio swap attempt %d failed: %s — retry in %.1fs",
                                 attempt, exc, retry_delay_s)
                        await asyncio.sleep(retry_delay_s)
                    else:
                        # Last attempt: no retry follows, so don't claim one
                        # and don't hold the swap lock for a pointless sleep.
                        log.info("audio swap attempt %d failed: %s",
                                 attempt, exc)
            if new_mic is None or new_spk is None:
                log.warning("audio swap %s → %s: all %d attempts failed (%s); "
                            "keeping current profile",
                            prev, profile_id, max_attempts, last_exc)
                return

            # Drain the old playback so any in-flight AI utterance stops
            # (interrupts mid-word — acceptable per spec, <1s gap).
            # MUST be awaited via to_thread: _PyAudioSpeaker.stop now
            # takes a per-instance RLock and an in-flight send_chunk on
            # the executor may be holding it across a back-pressured
            # PortAudio write. Calling stop() synchronously on the
            # event-loop thread would wedge the entire loop (mic,
            # vision, session.receive) until the pulse buffer drains.
            try:
                await asyncio.to_thread(self._speaker.stop)
            except Exception:
                pass  # best-effort: the old speaker is being discarded anyway
            try:
                self._mic.flush()
            except Exception:
                pass  # best-effort: the old mic is being discarded anyway

            # Atomic ref swap — next read_chunk / send_chunk hits new.
            old_mic, old_spk = self._mic, self._speaker
            self._mic = new_mic
            self._speaker = new_spk
            self._current_profile_id = profile_id
            self._reset_turn_state()

            # Tear down old AFTER the ref swap so any executor call still
            # in flight finishes against the old handle and the next loop
            # iteration picks up the new one cleanly.
            try:
                await asyncio.to_thread(old_mic.stop)
            except Exception:
                pass  # best-effort teardown
            try:
                await asyncio.to_thread(old_spk.stop)
            except Exception:
                pass  # best-effort teardown

            # Silent context to Gemini — so it knows the input chain changed
            # if asked (matches the _announce_vision_state pattern).
            try:
                await session.send_realtime_input(text=(
                    f"[AUDIO SWITCH] Mic + speaker are now on the {profile_id!s} "
                    f"audio profile. Do not announce this unprompted; just keep "
                    f"replying normally — the user's voice may sound clearer or "
                    f"different on the new device."
                ))
            except asyncio.CancelledError:
                raise
            except Exception as exc:
                log.warning("audio-swap announce failed: %s", exc)
            log.info("audio swap complete: %s → %s", prev, profile_id)

    # ─── recognition state watcher ────────────────────────
    # Polls data/.recognition_state.json every SANAD_RECOGNITION_POLL_S
    # seconds and mirrors the vision / face-rec / zone-rec / movement toggles
    # into in-memory flags so the rest of the session can react WITHOUT a
    # Gemini reconnect.

    async def _recognition_state_watcher(self, session: Any) -> None:
        """Mirror the recognition state file into the live session.

        At start-up this sends the face-gallery primer (only when face
        recognition and vision are both on), the zones primer (when zone
        recognition is on), the four non-toggle status notes, and the current
        destination if it differs from ``self._nav_target``.

        It then polls ``_RECOG_STATE_PATH`` every ``_RECOG_POLL_S`` seconds
        until ``self._done`` or ``self._stop_flag`` is set. When the file's
        mtime changes the state is re-read, and each difference from the
        previously seen state is announced and/or re-primed. A failed read is
        logged and retried on the next poll instead of ending the watcher.

        Args:
            session: Live session, passed through to the announce and primer
                helpers.
        """
        last_mtime = 0.0
        last_state = _recog_state.RecognitionState(
            vision_enabled=self._vision_enabled,
            face_rec_enabled=self._face_rec_enabled,
            gallery_version=self._gallery_version_primed,
            zone_rec_enabled=self._zone_rec_enabled,
            zones_version=self._zones_version_primed,
            movement_enabled=self._movement_enabled,
        )
        # Best-effort initial primer if face_rec is already on at session start.
        if self._face_rec_enabled and self._vision_enabled:
            try:
                cur = _recog_state.read(_RECOG_STATE_PATH)
                await self._send_gallery_primer(session, cur.gallery_version)
            except Exception as exc:
                log.warning("initial gallery primer failed: %s", exc)

        # N3 — initial zones primer if zone recognition is already on. Unlike
        # faces this does NOT require vision: name+description-only places still
        # give Gemini useful knowledge to talk about.
        if self._zone_rec_enabled:
            try:
                cur = _recog_state.read(_RECOG_STATE_PATH)
                await self._send_zone_primer(session, cur.zones_version)
            except Exception as exc:
                log.warning("initial zone primer failed: %s", exc)

        # Tell Gemini the current camera + recognition + movement state at
        # session start — silent standing context so "can you see me?" / "do
        # you know who I am?" are answered honestly even if nothing is toggled.
        await self._announce_vision_state(
            session, self._vision_enabled, is_toggle=False,
        )
        await self._announce_facerec_state(
            session, self._face_rec_enabled, is_toggle=False,
        )
        await self._announce_zonerec_state(
            session, self._zone_rec_enabled, is_toggle=False,
        )
        await self._announce_movement_state(
            session, self._movement_enabled, is_toggle=False,
        )
        # N3 — announce the active "go here" destination (if any). _nav_target
        # was reset to (-1,-1) per session so this fires on every reconnect.
        try:
            cur = _recog_state.read(_RECOG_STATE_PATH)
            nav = (cur.nav_target_zone_id, cur.nav_target_place_id)
            if nav != self._nav_target:
                await self._announce_nav_target(session, nav[0], nav[1])
                self._nav_target = nav
        except Exception as exc:
            log.warning("initial nav-target announce failed: %s", exc)

        # True while consecutive reads of the same file version keep failing;
        # used to log the failure once instead of once per poll.
        read_failing = False
        while not self._done.is_set() and not self._stop_flag.is_set():
            await asyncio.sleep(_RECOG_POLL_S)
            try:
                st = _RECOG_STATE_PATH.stat()
            except Exception:
                # Missing or unreadable file — nothing to diff on this tick.
                continue
            if st.st_mtime == last_mtime:
                continue
            try:
                new_state = _recog_state.read(_RECOG_STATE_PATH)
            except Exception as exc:
                # Keep last_mtime unchanged so this file version is retried on
                # the next poll rather than being skipped until the next write.
                if not read_failing:
                    log.warning("recognition state read failed: %s", exc)
                    read_failing = True
                continue
            read_failing = False
            last_mtime = st.st_mtime

            # Vision toggle — instant. Announce it out loud so Gemini reacts
            # ("I can see you now" / "I can't see you anymore").
            if new_state.vision_enabled != last_state.vision_enabled:
                self._vision_enabled = new_state.vision_enabled
                log.info("vision toggled → %s", self._vision_enabled)
                await self._announce_vision_state(
                    session, self._vision_enabled, is_toggle=True,
                )

            # Face-rec toggle — announce it out loud. The OFF announcement
            # also tells Gemini to disregard the gallery, so OFF takes effect
            # immediately instead of lingering until the next reconnect.
            if new_state.face_rec_enabled != last_state.face_rec_enabled:
                self._face_rec_enabled = new_state.face_rec_enabled
                if self._face_rec_enabled:
                    log.info("face rec enabled — announcing + sending primer")
                else:
                    log.info("face rec disabled — telling Gemini to "
                             "disregard the gallery")
                await self._announce_facerec_state(
                    session, self._face_rec_enabled, is_toggle=True,
                )

            # Conditions for re-priming:
            #  - face_rec just turned ON
            #  - gallery version bumped since the last primer
            face_rec_just_on = (
                new_state.face_rec_enabled and not last_state.face_rec_enabled
            )
            gallery_changed = (
                new_state.gallery_version != self._gallery_version_primed
            )
            if (self._face_rec_enabled
                    and (face_rec_just_on or gallery_changed)
                    and self._vision_enabled):
                try:
                    await self._send_gallery_primer(
                        session, new_state.gallery_version,
                    )
                except Exception as exc:
                    log.warning("gallery primer failed: %s", exc)

            # N3 — zone-recognition toggle (announce out loud, like face-rec).
            if new_state.zone_rec_enabled != last_state.zone_rec_enabled:
                self._zone_rec_enabled = new_state.zone_rec_enabled
                log.info("zone rec toggled → %s", self._zone_rec_enabled)
                await self._announce_zonerec_state(
                    session, self._zone_rec_enabled, is_toggle=True,
                )

            # Re-prime zones when zone-rec just turned ON or the zones version
            # bumped (any zone/place/face-link/photo CRUD). No vision needed.
            zone_rec_just_on = (
                new_state.zone_rec_enabled and not last_state.zone_rec_enabled
            )
            zones_changed = (
                new_state.zones_version != self._zones_version_primed
            )
            if self._zone_rec_enabled and (zone_rec_just_on or zones_changed):
                try:
                    await self._send_zone_primer(
                        session, new_state.zones_version,
                    )
                except Exception as exc:
                    log.warning("zone primer failed: %s", exc)

            # N3 — "go here" destination changed (set or cleared). Announce +
            # show the reference photo. Diffed against the announced tuple so a
            # CRUD-only version bump above doesn't double-fire this.
            nav = (new_state.nav_target_zone_id, new_state.nav_target_place_id)
            if nav != self._nav_target:
                self._nav_target = nav
                await self._announce_nav_target(session, nav[0], nav[1])

            # N2 — movement enable/disable toggle (spoken confirmation only).
            if new_state.movement_enabled != last_state.movement_enabled:
                self._movement_enabled = new_state.movement_enabled
                log.info("movement toggled → %s", self._movement_enabled)
                await self._announce_movement_state(
                    session, self._movement_enabled, is_toggle=True,
                )

            last_state = new_state

    # ─── camera frame send loop ───────────────────────────
    # Reads the latest JPEG from the _LATEST_FRAME cache (fed by the
    # _stdin_watcher thread, which the GeminiSubprocess supervisor pushes
    # 'frame:<b64>' lines into) and relays it to Gemini Live at
    # _VISION_SEND_HZ. Only active when self._vision_enabled. Skips frames
    # older than _VISION_STALE_MS so a stopped/unplugged camera doesn't
    # waste tokens on a frozen scene.

    async def _send_frame_loop(self, session: Any) -> None:
        """Relay the newest cached camera JPEG to the live session.

        Runs until ``self._done`` or ``self._stop_flag`` is set. Each tick
        waits for the send period (or the current error backoff, whichever
        is longer) and then skips the frame when vision is off, when no
        frame is cached, when the cached frame is older than
        ``_VISION_STALE_MS``, or when it was already relayed.

        Args:
            session: Live session; frames go out through
                ``send_realtime_input(video=...)``.
        """
        interval = 1.0 / max(0.5, _VISION_SEND_HZ)
        max_age_s = _VISION_STALE_MS / 1000.0
        retry_wait = 0.0
        relayed_ts = 0.0

        while not (self._done.is_set() or self._stop_flag.is_set()):
            await asyncio.sleep(max(interval, retry_wait))
            if not self._vision_enabled:
                continue
            # Snapshot the cache under its lock; do the slow work outside it.
            with _LATEST_FRAME_LOCK:
                jpeg = _LATEST_FRAME.get("bytes")
                captured_at = _LATEST_FRAME.get("ts", 0.0)
            if not jpeg:
                continue
            # Stale (nothing new is being pushed) or already sent: skip.
            too_old = (time.time() - captured_at) > max_age_s
            if too_old or captured_at == relayed_ts:
                continue
            try:
                frame = types.Blob(data=jpeg, mime_type="image/jpeg")
                await session.send_realtime_input(video=frame)
                relayed_ts = captured_at
                retry_wait = 0.0
            except asyncio.CancelledError:
                return
            except Exception as exc:
                log.warning("frame send failed: %s", exc)
                # Grow the wait after each failure, capped at 5 seconds.
                retry_wait = min(retry_wait * 2 + 0.5, 5.0)

    # ─── motion-state inject loop ─────────────────────────
    # Drains _STATE_PENDING (fed by the _stdin_watcher from 'state:' lines
    # the supervisor pushes when the arm starts/finishes/errors a motion)
    # and injects each as silent text context into the live session, so
    # Gemini can answer "what are you doing?" honestly. Per persona, Gemini
    # reads these for context but does not narrate them unprompted.

    async def _send_state_loop(self, session: Any) -> None:
        """Forward queued state messages to the live session as text.

        Every 0.1 s the contents of ``_STATE_PENDING`` are taken (and the
        queue cleared) under ``_STATE_LOCK``, then each message is sent with
        ``send_realtime_input(text=...)``. A failed send is logged and the
        remaining messages are still attempted.

        Args:
            session: Live session that receives the messages.
        """
        while not (self._done.is_set() or self._stop_flag.is_set()):
            await asyncio.sleep(0.1)
            # Copy-and-clear under the lock so sending happens outside it.
            with _STATE_LOCK:
                batch = list(_STATE_PENDING)
                if batch:
                    _STATE_PENDING.clear()
            for note in batch:
                try:
                    await session.send_realtime_input(text=note)
                    log.info("STATE injected: %s", note)
                except asyncio.CancelledError:
                    return
                except Exception as exc:
                    # Some SDK versions may not accept text on
                    # send_realtime_input — log and keep going; only this
                    # context channel is lost.
                    log.warning("state inject failed: %s", exc)

    # ─── face gallery primer ──────────────────────────────
    # Builds one multimodal turn carrying the entire face gallery + a Khaleeji
    # greeting instruction, and sends it via send_client_content. Gemini keeps
    # this in session context until reconnect. Re-sent on gallery_version bumps.

    async def _send_gallery_primer(self, session: Any, version: int) -> None:
        """Send the face gallery to the session as one multimodal user turn.

        The turn holds an instruction text followed, per gallery entry, by a
        caption (name and optional notes) and that entry's JPEG samples.
        ``self._gallery_version_primed`` is set to ``version`` when the
        gallery is empty or when the send succeeds; it is left unchanged if
        the gallery module cannot be imported, loading fails, or the send
        fails.

        Args:
            session: Live session; the turn goes out through
                ``send_client_content``.
            version: Gallery version this primer corresponds to.
        """
        try:
            from Project.Sanad.vision.face_gallery import FaceGallery
        except Exception as exc:
            log.info("face gallery module unavailable: %s", exc)
            return

        store = FaceGallery(_FACES_DIR)
        try:
            people = store.load_for_primer(
                max_samples_per_face=_FACES_MAX_SAMPLES,
                resize_long_side=_FACES_PRIMER_RESIZE,
            )
        except Exception as exc:
            log.warning("face gallery load failed: %s", exc)
            return

        if not people:
            log.info("face gallery empty — primer skipped (v.%d)", version)
            self._gallery_version_primed = version
            return

        intro = (
            "GALLERY PRIMER (do not reply to this turn). "
            "Below are people you know. When the live camera shows one of "
            "them, greet them warmly by name in UAE Khaleeji dialect "
            "(for example: 'هلا والله يا كسام، شحالك؟'), and you may use "
            "the notes about them to make the conversation personal. "
            "For faces NOT in this gallery, welcome them as a guest "
            "without inventing a name. Greet each person only once per "
            "minute to avoid repetition."
        )
        parts: list[dict[str, Any]] = [{"text": intro}]
        for person, samples in people:
            if person.name:
                caption = f"This person is named {person.name}."
            else:
                caption = "This person's name is unknown — greet as guest."
            if person.description:
                caption = f"{caption} Notes about them: {person.description}"
            parts.append({"text": "\n— " + caption})
            # The caption is followed directly by that person's images.
            parts.extend(
                {"inline_data": {"mime_type": "image/jpeg", "data": img}}
                for img in samples
            )

        try:
            await session.send_client_content(
                turns=[{"role": "user", "parts": parts}],
                turn_complete=True,
            )
        except Exception as exc:
            log.warning("primer send failed: %s", exc)
            return
        # Record the version only after a successful send.
        self._gallery_version_primed = version
        log.info("face gallery primed: %d person(s), v.%d", len(people), version)

    # ─── zones primer (N3) ────────────────────────────────
    # One multimodal turn carrying every zone, its places (name + description +
    # reference photos), and the people linked to each place. A place may have
    # NO photos (name + description only), so empty image lists are tolerated.

    async def _send_zone_primer(self, session: Any, version: int) -> None:
        """Send every known zone and place to the session as one user turn.

        The turn holds an instruction text, then per zone a header and per
        place a label (name, optional description, names of linked people)
        followed by that place's JPEG samples. Places without photos are
        sent as text only. ``self._zones_version_primed`` is set to
        ``version`` when the gallery is empty or when the send succeeds; it
        is left unchanged if the zone module cannot be imported, loading
        fails, or the send fails.

        Resolving linked face ids to names is best-effort: a failure there is
        logged and the primer is still sent, without the names that could not
        be resolved.

        Args:
            session: Live session; the turn goes out through
                ``send_client_content``.
            version: Zones version this primer corresponds to.
        """
        try:
            from Project.Sanad.vision.zone_gallery import ZoneGallery
        except Exception as exc:
            log.info("zone gallery module unavailable: %s", exc)
            return

        gallery = ZoneGallery(_ZONES_DIR)
        try:
            entries = gallery.load_for_primer(
                max_samples_per_place=_FACES_MAX_SAMPLES,
                resize_long_side=_FACES_PRIMER_RESIZE,
            )
        except Exception as exc:
            log.warning("zone gallery load failed: %s", exc)
            return

        if not entries:
            log.info("zone gallery empty — primer skipped (v.%d)", version)
            self._zones_version_primed = version
            return

        # Resolve linked face ids → names once (cheap, small galleries).
        face_names: dict[int, str] = {}
        try:
            from Project.Sanad.vision.face_gallery import FaceGallery
            for fe in FaceGallery(_FACES_DIR).list():
                if fe.name:
                    face_names[fe.id] = fe.name
        except Exception as exc:
            # Best-effort: keep whatever names were resolved, but leave a
            # trace so a missing "People often here" can be explained.
            log.warning("zone primer: face-name lookup failed: %s", exc)

        parts: list[dict[str, Any]] = [{
            "text": (
                "ZONES PRIMER (do not reply to this turn). Below are the zones "
                "and places you know, with the people often found at each place. "
                "Use them to answer where things are, to name a place when the "
                "live camera shows one, and to make directions personal. Do not "
                "invent zones or places that are not listed here."
            ),
        }]
        n_zones = n_places = 0
        for zone, places in entries:
            n_zones += 1
            zhdr = f"\n# Zone: {zone.name or '(unnamed)'}"
            if zone.description:
                zhdr += f" — {zone.description}"
            parts.append({"text": zhdr})
            if not places:
                parts.append({"text": "  (no places yet)"})
            for place, jpegs in places:
                n_places += 1
                label = f"\n  - Place: {place.name or '(unnamed)'}"
                if place.description:
                    label += f" — {place.description}"
                # Only linked faces that have a name are listed.
                people = [face_names[f] for f in place.face_ids if f in face_names]
                if people:
                    label += f" | People often here: {', '.join(people)}"
                parts.append({"text": label})
                for jpeg in jpegs:
                    parts.append({
                        "inline_data": {"mime_type": "image/jpeg", "data": jpeg},
                    })

        try:
            await session.send_client_content(
                turns=[{"role": "user", "parts": parts}],
                turn_complete=True,
            )
        except Exception as exc:
            log.warning("zone primer send failed: %s", exc)
            return
        # Record the version only after a successful send.
        self._zones_version_primed = version
        log.info("zones primed: %d zone(s), %d place(s), v.%d",
                 n_zones, n_places, version)