371 lines
15 KiB
Python
371 lines
15 KiB
Python
"""Gemini brain — live conversation loop using the google-genai SDK.
|
|
|
|
Implements the VoiceBrain contract documented in `voice/model_script.py`:
|
|
|
|
__init__(audio_io, recorder, voice_name, system_prompt)
|
|
async run()
|
|
stop()
|
|
|
|
Owns everything Gemini-specific: the `genai.Client`, `LiveConnectConfig`,
|
|
the session connect/receive loop, VAD-based barge-in, echo suppression,
|
|
reconnect backoff. Hardware I/O is delegated to `audio_io` and per-turn
|
|
WAV capture to `recorder` — both are model-agnostic.
|
|
|
|
Env overrides:
|
|
SANAD_GEMINI_MODEL — Gemini Live model id (without "models/" prefix)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import array
|
|
import asyncio
|
|
import os
|
|
import time
|
|
from typing import Any, Optional
|
|
|
|
import numpy as np
|
|
|
|
from google import genai
|
|
from google.genai import types
|
|
|
|
from Project.Sanad.config import (
|
|
CHUNK_SIZE,
|
|
GEMINI_API_KEY,
|
|
GEMINI_VOICE,
|
|
RECEIVE_SAMPLE_RATE,
|
|
SEND_SAMPLE_RATE,
|
|
)
|
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
|
from Project.Sanad.core.logger import get_logger
|
|
|
|
log = get_logger("gemini_brain")

# Config sections: voice tuning, VAD thresholds, barge-in behavior.
_SV = _cfg_section("voice", "sanad_voice")
_VAD = _cfg_section("voice", "vad")
_BI = _cfg_section("voice", "barge_in")

# Gemini Live model id — overridable via SANAD_GEMINI_MODEL (no "models/" prefix).
_MODEL = os.environ.get(
    "SANAD_GEMINI_MODEL",
    "gemini-2.5-flash-native-audio-preview-12-2025",
)
_MIC_GAIN = _SV.get("mic_gain", 1.0)  # software gain applied to mic samples
_SESSION_TIMEOUT = _SV.get("session_timeout_sec", 660)  # hard cap per session before reconnect
_MAX_RECONNECT_DELAY = _SV.get("max_reconnect_delay_sec", 30)  # ceiling for exponential backoff
_MAX_CONSECUTIVE_ERRORS = _SV.get("max_consecutive_errors", 10)  # errors before client recreation
_NO_MESSAGES_TIMEOUT = _SV.get("no_messages_timeout_sec", 30)  # receive-side dead-session watchdog

_CHUNK_BYTES = CHUNK_SIZE * 2  # 16-bit mono PCM: 2 bytes per sample
_SILENCE_PCM = b"\x00" * _CHUNK_BYTES  # one chunk of silence, used for echo masking
|
|
|
|
|
|
def _audio_energy(pcm: bytes) -> int:
|
|
try:
|
|
samples = array.array("h", pcm)
|
|
return sum(abs(s) for s in samples) // len(samples) if samples else 0
|
|
except Exception:
|
|
return 0
|
|
|
|
|
|
class GeminiBrain:
    """Gemini Live conversation brain — reconnect-safe."""

    def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
                 system_prompt: str = ""):
        """Wire up hardware I/O and per-turn recording; no network work here.

        Args:
            audio_io: object exposing ``.mic`` (read_chunk/flush) and
                ``.speaker`` (begin_stream/send_chunk/stop/wait_finish) —
                model-agnostic hardware layer.
            recorder: per-turn audio/transcript capture sink.
            voice_name: Gemini prebuilt voice name; falls back to GEMINI_VOICE.
            system_prompt: system instruction text sent in the session config.
        """
        self._audio = audio_io
        self._mic = audio_io.mic
        self._speaker = audio_io.speaker
        self._recorder = recorder
        self._voice = voice_name or GEMINI_VOICE
        self._system_prompt = system_prompt
        self._api_key = GEMINI_API_KEY
        self._stop_flag = asyncio.Event()  # set by stop(); polled by all loops
        # per-session state (reset in the outer reconnect loop)
        self._speaking = False            # AI audio is currently being played
        self._stream_started = False      # speaker stream opened for this turn
        self._barge_block_until = 0.0     # ignore barge-in before this timestamp
        self._ai_speak_start = 0.0        # when the current AI turn began
        self._last_ai_audio = 0.0         # last time an AI audio chunk arrived
        self._done: Optional[asyncio.Event] = None  # per-session end signal
|
|
|
|
def stop(self) -> None:
|
|
"""Signal the run loop to exit at the next opportunity."""
|
|
try:
|
|
self._stop_flag.set()
|
|
except Exception:
|
|
pass
|
|
|
|
# ─── public entry point ───────────────────────────────
|
|
|
|
    async def run(self) -> None:
        """Main loop: connect to Gemini Live and pump audio until stop().

        Reconnects indefinitely. On errors, backs off exponentially (capped at
        _MAX_RECONNECT_DELAY) and recreates the genai client after
        _MAX_CONSECUTIVE_ERRORS failures in a row.
        """
        client = genai.Client(api_key=self._api_key)
        config = self._build_config()
        session_num = 0
        start_time = time.time()
        consecutive_errors = 0

        while not self._stop_flag.is_set():
            session_num += 1
            self._reset_turn_state()
            uptime_min = (time.time() - start_time) / 60

            try:
                log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
                         session_num, uptime_min)
                async with client.aio.live.connect(model=_MODEL, config=config) as session:
                    log.info("connected — speak anytime!")
                    consecutive_errors = 0
                    self._mic.flush()  # drop any stale mic audio from before connect
                    self._done = asyncio.Event()  # shared end-of-session signal for both tasks

                    try:
                        # Pump mic-send and receive concurrently; the timeout
                        # forces a periodic clean reconnect of long sessions.
                        await asyncio.wait_for(
                            asyncio.gather(
                                self._send_mic_loop(session),
                                self._receive_loop(session),
                            ),
                            timeout=_SESSION_TIMEOUT,
                        )
                    except asyncio.TimeoutError:
                        log.warning("session timed out after %ds", _SESSION_TIMEOUT)
                    except asyncio.CancelledError:
                        log.warning("session cancelled")

                log.info("session #%d ended — reconnecting in 1s", session_num)
                self._speaker.stop()
                self._mic.flush()
                await asyncio.sleep(1)

            except asyncio.CancelledError:
                log.info("cancelled — stopping")
                break
            except KeyboardInterrupt:
                log.info("keyboard interrupt — stopping")
                break
            except Exception as exc:
                consecutive_errors += 1
                # Exponential backoff: 2, 4, 8, ... capped at _MAX_RECONNECT_DELAY.
                delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
                log.error("session error (#%d): %s — reconnecting in %ds",
                          consecutive_errors, exc, delay)
                await asyncio.sleep(delay)
                if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
                    # Repeated failures may mean the client itself is wedged.
                    log.warning("%d consecutive errors — recreating client",
                                consecutive_errors)
                    try:
                        client = genai.Client(api_key=self._api_key)
                        consecutive_errors = 0
                    except Exception as ce:
                        log.error("client recreation failed: %s", ce)
|
|
|
|
# ─── Gemini config ────────────────────────────────────
|
|
|
|
def _build_config(self) -> types.LiveConnectConfig:
|
|
return types.LiveConnectConfig(
|
|
response_modalities=["AUDIO"],
|
|
speech_config=types.SpeechConfig(
|
|
voice_config=types.VoiceConfig(
|
|
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
|
voice_name=self._voice,
|
|
),
|
|
),
|
|
),
|
|
realtime_input_config=types.RealtimeInputConfig(
|
|
automatic_activity_detection=types.AutomaticActivityDetection(
|
|
disabled=False,
|
|
start_of_speech_sensitivity=getattr(
|
|
types.StartSensitivity,
|
|
_VAD.get("start_sensitivity", "START_SENSITIVITY_HIGH"),
|
|
),
|
|
end_of_speech_sensitivity=getattr(
|
|
types.EndSensitivity,
|
|
_VAD.get("end_sensitivity", "END_SENSITIVITY_LOW"),
|
|
),
|
|
prefix_padding_ms=_VAD.get("prefix_padding_ms", 20),
|
|
silence_duration_ms=_VAD.get("silence_duration_ms", 200),
|
|
),
|
|
),
|
|
input_audio_transcription=types.AudioTranscriptionConfig(),
|
|
output_audio_transcription=types.AudioTranscriptionConfig(),
|
|
system_instruction=types.Content(
|
|
parts=[types.Part(text=self._system_prompt)],
|
|
),
|
|
)
|
|
|
|
# ─── state helpers ────────────────────────────────────
|
|
|
|
def _reset_turn_state(self) -> None:
|
|
self._speaking = False
|
|
self._stream_started = False
|
|
self._barge_block_until = 0.0
|
|
self._ai_speak_start = 0.0
|
|
self._last_ai_audio = 0.0
|
|
|
|
    def _interrupt(self, source: str = "local") -> None:
        """Hard-stop AI playback (barge-in, server interrupt, or local request).

        Clears the speaking flags first so the mic loop stops echo-masking
        immediately, then halts the speaker, drops buffered mic audio, and
        closes the recorder's current turn.
        """
        self._speaking = False
        self._stream_started = False
        self._speaker.stop()
        self._mic.flush()
        self._recorder.finish_turn()
        log.info("interrupt (%s)", source)
|
|
|
|
# ─── mic send loop ────────────────────────────────────
|
|
|
|
async def _send_mic_loop(self, session: Any) -> None:
|
|
threshold = _BI.get("threshold", 500)
|
|
chunks_needed = _BI.get("loud_chunks_needed", 3)
|
|
cooldown = _BI.get("cooldown_sec", 0.3)
|
|
echo_suppress_below = _BI.get("echo_suppress_below", 500)
|
|
grace = _BI.get("ai_speak_grace_sec", 0.15)
|
|
|
|
loop = asyncio.get_event_loop()
|
|
loud_count = 0
|
|
last_activity = time.time()
|
|
|
|
while not self._done.is_set() and not self._stop_flag.is_set():
|
|
try:
|
|
raw = await loop.run_in_executor(
|
|
None, self._mic.read_chunk, _CHUNK_BYTES,
|
|
)
|
|
except Exception:
|
|
break
|
|
|
|
samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
|
|
samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
|
|
data = samples.tobytes()
|
|
energy = _audio_energy(data)
|
|
now = time.time()
|
|
|
|
# Barge-in: after AI starts speaking, sustained user energy cuts it.
|
|
if self._speaking and now >= self._barge_block_until:
|
|
if (now - self._ai_speak_start) >= grace:
|
|
if energy > threshold:
|
|
loud_count += 1
|
|
else:
|
|
loud_count = max(0, loud_count - 1)
|
|
if loud_count > chunks_needed:
|
|
log.info("BARGE-IN (e=%d)", energy)
|
|
self._interrupt("barge-in")
|
|
loud_count = 0
|
|
self._barge_block_until = now + cooldown
|
|
|
|
# Echo suppression: while AI is speaking, mask quiet frames so the
|
|
# mic doesn't feed the model its own voice bleed.
|
|
send_data = data
|
|
if self._speaking and energy < echo_suppress_below:
|
|
send_data = _SILENCE_PCM
|
|
|
|
# Record user audio when clearly speaking and AI isn't.
|
|
if energy > 250 and not self._speaking:
|
|
self._recorder.capture_user(data)
|
|
|
|
# Keep-alive watchdog
|
|
if energy > 250:
|
|
last_activity = now
|
|
elif now - last_activity > 10:
|
|
log.info("alive (no speech %.0fs, e=%d)",
|
|
now - last_activity, energy)
|
|
last_activity = now
|
|
|
|
try:
|
|
await session.send_realtime_input(
|
|
audio=types.Blob(
|
|
data=send_data,
|
|
mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}",
|
|
),
|
|
)
|
|
except asyncio.CancelledError:
|
|
return
|
|
except Exception as exc:
|
|
log.warning("mic send failed: %s — ending session", exc)
|
|
self._done.set()
|
|
return
|
|
|
|
await asyncio.sleep(CHUNK_SIZE / SEND_SAMPLE_RATE)
|
|
|
|
log.info("send_mic task ended")
|
|
|
|
# ─── receive loop ─────────────────────────────────────
|
|
|
|
async def _receive_loop(self, session: Any) -> None:
|
|
loop = asyncio.get_event_loop()
|
|
try:
|
|
last_recv = time.time()
|
|
while not self._done.is_set() and not self._stop_flag.is_set():
|
|
async for response in session.receive():
|
|
last_recv = time.time()
|
|
if self._done.is_set():
|
|
break
|
|
|
|
if hasattr(response, "go_away") and response.go_away is not None:
|
|
log.info("server going away — will reconnect")
|
|
self._done.set()
|
|
return
|
|
|
|
sc = response.server_content
|
|
if sc is None:
|
|
continue
|
|
|
|
if sc.interrupted is True:
|
|
if self._speaking:
|
|
log.info("Gemini interrupted")
|
|
self._interrupt("gemini")
|
|
continue
|
|
|
|
if sc.input_transcription:
|
|
text = (sc.input_transcription.text or "").strip()
|
|
if text and not self._speaking:
|
|
log.info("USER: %s", text)
|
|
self._recorder.add_user_text(text)
|
|
|
|
if sc.output_transcription:
|
|
text = (sc.output_transcription.text or "").strip()
|
|
if text:
|
|
log.info("BOT : %s", text)
|
|
self._recorder.add_robot_text(text)
|
|
|
|
if sc.model_turn:
|
|
for part in sc.model_turn.parts:
|
|
if part.inline_data and part.inline_data.data:
|
|
now = time.time()
|
|
if not self._speaking:
|
|
self._ai_speak_start = now
|
|
self._speaking = True
|
|
self._last_ai_audio = now
|
|
raw_audio = part.inline_data.data
|
|
self._recorder.capture_robot(raw_audio)
|
|
audio = np.frombuffer(raw_audio, dtype=np.int16)
|
|
if not self._stream_started:
|
|
await loop.run_in_executor(
|
|
None, self._speaker.begin_stream,
|
|
)
|
|
self._stream_started = True
|
|
await loop.run_in_executor(
|
|
None, self._speaker.send_chunk,
|
|
audio, RECEIVE_SAMPLE_RATE,
|
|
)
|
|
|
|
if sc.turn_complete:
|
|
if (self._speaking and self._stream_started
|
|
and not self._speaker.interrupted):
|
|
log.info("speaker %.1fs", self._speaker.total_sent_sec)
|
|
await loop.run_in_executor(
|
|
None, self._speaker.wait_finish,
|
|
)
|
|
elif self._speaking and self._speaker.interrupted:
|
|
log.info("speaker interrupted")
|
|
self._speaking = False
|
|
self._stream_started = False
|
|
self._mic.flush()
|
|
self._recorder.finish_turn()
|
|
log.info("listening")
|
|
|
|
if time.time() - last_recv > _NO_MESSAGES_TIMEOUT:
|
|
log.warning("no messages from Gemini for %ds — session dead",
|
|
_NO_MESSAGES_TIMEOUT)
|
|
break
|
|
await asyncio.sleep(0.1)
|
|
|
|
except Exception as exc:
|
|
log.warning("receive ended: %s", exc)
|
|
finally:
|
|
self._done.set()
|