# Manual_Photographer/gemini_voice.py

# gemini_voice.py
import asyncio
import base64
import json
import time
import array
import inspect

import pyaudio
import websockets
from Logger import Logs

# Audio stream configuration shared by the microphone and speaker streams.
FORMAT = pyaudio.paInt16  # PyAudio sample format constant passed to every stream opened below
CHANNELS = 1  # channel count passed to every stream opened below
SEND_SAMPLE_RATE = 16000  # rate of the microphone streams; also advertised in the outgoing mime_type
RECEIVE_SAMPLE_RATE = 24000  # rate of the playback stream opened in play_audio
CHUNK_SIZE = 256  # frames_per_buffer for every stream and frames requested per microphone read


class HamadGeminiVoice:
    def __init__(self):
        """Set up logging, the PyAudio handle and all tuning/state attributes."""
        # --- Logging -------------------------------------------------------
        self.sanad_logger = Logs()
        self.sanad_logger.LogEngine("G1_Logs", "gemini_voice")

        # --- Audio backend and shared playback state -----------------------
        self.pya = pyaudio.PyAudio()
        # Filled by receive_audio, drained by play_audio, flushed on barge-in.
        self.audio_q = asyncio.Queue()
        # True while model audio is arriving / being played.
        self.speaking = False
        # Set by capture_mic on barge-in; cleared by receive_audio once the
        # server reports the turn as interrupted.
        self.interrupted = False

        # --- Barge-in detection --------------------------------------------
        # Floor for the calibrated threshold (see calibrate_mic).
        self.MIN_THRESHOLD = 3000
        # Starts at the floor until calibrate_mic raises it.
        self.barge_in_threshold = self.MIN_THRESHOLD
        # The consecutive loud-chunk count must exceed this to interrupt.
        self.REQUIRED_LOUD_CHUNKS = 5
        # Seconds after an interruption during which no new one is detected.
        self.BARGE_IN_COOLDOWN = 0.7
        # Seconds after the model starts speaking before barge-in is armed.
        self.AI_SPEAK_GRACE = 0.25

        # --- Playback tuning -----------------------------------------------
        # Queue depth play_audio waits for before it starts writing audio.
        self.PREBUFFER_CHUNKS = 1
        # Seconds play_audio waits on the queue before re-checking state.
        self.PLAYBACK_TIMEOUT = 0.2

        # --- Timestamps (time.time() values) -------------------------------
        self._last_ai_audio_time = 0.0
        self._ai_speaking_since = 0.0
        self._barge_in_block_until = 0.0

        # --- Echo handling while the model is speaking ---------------------
        # NOTE(review): _ignore_input_until is written by receive_audio but
        # not read anywhere in the visible part of this file.
        self.ECHO_GUARD_SEC = 0.8
        self._ignore_input_until = 0.0
        # When enabled, quiet microphone chunks are replaced with silence
        # while the model is speaking (see capture_mic).
        self.SEND_SILENCE_WHEN_SPEAKING = True
        # Fraction of barge_in_threshold below which a chunk counts as quiet.
        self.SPEAKING_ENERGY_GATE = 0.85
        # Zero-filled buffer of CHUNK_SIZE * 2 bytes sent in place of a
        # quiet chunk.
        self._silence_pcm = bytes(CHUNK_SIZE * 2)

    def audio_energy(self, pcm):
        try:
            samples = array.array("h", pcm)
            if not samples:
                return 0
            return sum(abs(s) for s in samples) // len(samples)
        except Exception:
            return 0

    def calibrate_mic(self):
        """Measure ambient noise and derive the barge-in energy threshold.

        Opens a temporary input stream, reads 40 chunks of ``CHUNK_SIZE``
        frames, averages their energy and sets ``self.barge_in_threshold``
        to three times that average, but never below ``self.MIN_THRESHOLD``.

        Any failure is logged as a warning and leaves the current threshold
        unchanged. This method blocks while it reads from the microphone.
        """
        self.sanad_logger.print_and_log("\n🤫 Calibrating Microphone... (Please remain silent)", message_type="info")
        try:
            stream = self.pya.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=SEND_SAMPLE_RATE,
                input=True,
                frames_per_buffer=CHUNK_SIZE,
            )
            try:
                values = [
                    self.audio_energy(stream.read(CHUNK_SIZE, exception_on_overflow=False))
                    for _ in range(40)
                ]
            finally:
                # Release the device even when a read fails; otherwise the
                # input stream stays open for the lifetime of the process.
                try:
                    stream.stop_stream()
                finally:
                    stream.close()

            avg_noise = sum(values) / len(values)
            self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)

            self.sanad_logger.print_and_log(f"✅ Baseline Noise: {avg_noise:.1f}", message_type="info")
            self.sanad_logger.print_and_log(f"✅ Interruption Threshold: {self.barge_in_threshold:.1f}", message_type="info")
        except Exception as e:
            self.sanad_logger.print_and_log(f"⚠️ Calibration failed: {e}. Using default threshold.", message_type="warning")

    def _ws_connect_kwargs(self):
        """Build keyword arguments for ``websockets.connect``.

        The header keyword is chosen by inspecting the signature of
        ``websockets.connect``: ``extra_headers`` when that parameter
        exists, ``additional_headers`` otherwise. If the signature cannot
        be inspected, ``extra_headers`` is used.

        Returns:
            A dict with ``max_size=None`` plus the JSON content-type header
            under the selected keyword.
        """
        header_kw = "extra_headers"
        try:
            accepted = inspect.signature(websockets.connect).parameters
            if "extra_headers" not in accepted:
                header_kw = "additional_headers"
        except Exception:
            # Signature inspection failed; fall back to the default keyword.
            header_kw = "extra_headers"
        return {"max_size": None, header_kw: {"Content-Type": "application/json"}}

    async def send_text_prompt(self, ws, text: str):
        msg = {
            "clientContent": {
                "turns": [{"role": "user", "parts": [{"text": text}]}],
                "turnComplete": True
            }
        }
        await ws.send(json.dumps(msg))

    async def keepalive(self, ws, every_sec: float = 20.0):
        """
        Prevent idle disconnects: periodically send a websocket ping.
        """
        while True:
            try:
                await asyncio.sleep(every_sec)
                await ws.ping()
            except Exception:
                break

    async def capture_mic(self, ws):
        """Stream microphone audio to the server and detect user barge-in.

        For every chunk of ``CHUNK_SIZE`` frames read from the microphone:

        * While the model is speaking (and both the barge-in cooldown and
          the initial grace period have elapsed), consecutive chunks louder
          than ``self.barge_in_threshold`` are counted. Once the count
          exceeds ``self.REQUIRED_LOUD_CHUNKS`` the turn is marked as
          interrupted, ``self.speaking`` is cleared, a cooldown starts and
          all queued playback audio is discarded.
        * While the model is still speaking and
          ``self.SEND_SILENCE_WHEN_SPEAKING`` is set, chunks quieter than
          ``barge_in_threshold * SPEAKING_ENERGY_GATE`` are replaced by
          silence before sending.
        * The chunk is base64-encoded and sent as a ``realtime_input`` JSON
          message.

        The loop ends when the websocket closes or any other error occurs;
        the input stream is then stopped and closed.

        Args:
            ws: Connection object exposing an awaitable ``send(str)``.
        """
        stream = await asyncio.to_thread(
            self.pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_SIZE,
        )

        # Loop-invariant: build the MIME type once instead of per chunk.
        mime_type = f"audio/pcm;rate={SEND_SAMPLE_RATE}"

        loud_chunks = 0
        while True:
            try:
                data = await asyncio.to_thread(stream.read, CHUNK_SIZE, exception_on_overflow=False)
                energy = self.audio_energy(data)
                now = time.time()

                if self.speaking and (now >= self._barge_in_block_until):
                    if (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE:
                        # Only an unbroken run of loud chunks counts.
                        loud_chunks = loud_chunks + 1 if energy > self.barge_in_threshold else 0
                        if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                            self.sanad_logger.print_and_log(f"🛑 Interruption! (Energy: {energy})", message_type="warning")
                            self.interrupted = True
                            self.speaking = False
                            loud_chunks = 0
                            self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN
                            # Drop audio that has not been played yet.
                            while not self.audio_q.empty():
                                try:
                                    self.audio_q.get_nowait()
                                except asyncio.QueueEmpty:
                                    break

                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = self._silence_pcm

                b64_audio = base64.b64encode(data_to_send).decode("utf-8")
                msg = {
                    "realtime_input": {
                        "media_chunks": [{
                            "data": b64_audio,
                            "mime_type": mime_type,
                        }]
                    }
                }
                await ws.send(json.dumps(msg))

            except websockets.exceptions.ConnectionClosed:
                self.sanad_logger.print_and_log("⚠️ WebSocket closed.", message_type="warning")
                break
            except Exception as e:
                self.sanad_logger.print_and_log(f"❌ Mic Error: {e}", message_type="error")
                break

        def _release():
            try:
                stream.stop_stream()
            finally:
                stream.close()

        # Release the input device once the loop has ended. This is done
        # after the loop rather than in a ``finally`` so it never runs while
        # a cancelled read may still be executing in the worker thread.
        try:
            await asyncio.to_thread(_release)
        except Exception as e:
            self.sanad_logger.print_and_log(f"⚠️ Mic stream cleanup failed: {e}", message_type="warning")

    async def receive_audio(self, ws):
        try:
            async for msg in ws:
                try:
                    response = json.loads(msg)
                    server_content = response.get("serverContent", {})

                    if server_content.get("interrupted"):
                        self.interrupted = False

                    if self.interrupted:
                        continue

                    model_turn = server_content.get("modelTurn")
                    if model_turn:
                        for part in model_turn.get("parts", []):
                            inline_data = part.get("inlineData")
                            if inline_data and inline_data.get("data"):
                                now = time.time()
                                if not self.speaking:
                                    self._ai_speaking_since = now
                                self.speaking = True
                                self._last_ai_audio_time = now
                                self._ignore_input_until = now + self.ECHO_GUARD_SEC

                                audio_bytes = base64.b64decode(inline_data["data"])
                                await self.audio_q.put(audio_bytes)

                except Exception as e:
                    self.sanad_logger.print_and_log(f"❌ Parse Error: {e}", message_type="error")
        except websockets.exceptions.ConnectionClosed:
            self.sanad_logger.print_and_log("⚠️ WebSocket closed.", message_type="warning")

    async def play_audio(self):
        """Play queued model audio on the output device.

        Loop behaviour:

        * While ``self.interrupted`` is set, nothing is played.
        * At the start of an utterance, playback waits until the queue
          holds ``self.PREBUFFER_CHUNKS`` chunks (or speaking/interrupted
          state changes).
        * Chunks are taken from ``self.audio_q`` with a timeout of
          ``self.PLAYBACK_TIMEOUT`` seconds and written to the stream.
        * When the queue is empty and more than 0.20 s have passed since
          the last audio was received, ``self.speaking`` is cleared and
          pre-buffering is re-armed.

        The loop ends on the first unexpected error, which is logged; the
        output stream is then stopped and closed.
        """
        stream = await asyncio.to_thread(
            self.pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=RECEIVE_SAMPLE_RATE,
            output=True,
            frames_per_buffer=CHUNK_SIZE,
        )

        def playback_idle():
            # No queued audio and nothing new received for over 0.20 s.
            return self.audio_q.empty() and (time.time() - self._last_ai_audio_time) > 0.20

        buffered = False
        while True:
            try:
                if self.interrupted:
                    await asyncio.sleep(0.01)
                    continue

                if self.speaking and not buffered:
                    while self.audio_q.qsize() < self.PREBUFFER_CHUNKS and self.speaking and not self.interrupted:
                        await asyncio.sleep(0.005)
                    buffered = True

                try:
                    data = await asyncio.wait_for(self.audio_q.get(), timeout=self.PLAYBACK_TIMEOUT)
                except asyncio.TimeoutError:
                    if playback_idle():
                        self.speaking = False
                        buffered = False
                    continue

                if data:
                    await asyncio.to_thread(stream.write, data)

                if playback_idle():
                    self.speaking = False
                    buffered = False

            except Exception as e:
                self.sanad_logger.print_and_log(f"❌ Speaker Error: {e}", message_type="error")
                break

        def _release():
            try:
                stream.stop_stream()
            finally:
                stream.close()

        # Release the output device once the loop has ended. This is done
        # after the loop rather than in a ``finally`` so it never runs while
        # a cancelled write may still be executing in the worker thread.
        try:
            await asyncio.to_thread(_release)
        except Exception as e:
            self.sanad_logger.print_and_log(f"⚠️ Speaker stream cleanup failed: {e}", message_type="warning")