# gemini_voice.py
import asyncio
import base64
import json
import time
import array
import inspect

import pyaudio
import websockets

from Logger import Logs

# Audio format shared by the microphone and speaker streams.
FORMAT = pyaudio.paInt16
CHANNELS = 1
SEND_SAMPLE_RATE = 16000      # Hz, microphone -> server
RECEIVE_SAMPLE_RATE = 24000   # Hz, server -> speaker
CHUNK_SIZE = 256              # frames per PyAudio buffer / per mic read


class HamadGeminiVoice:
    """State and coroutines for a full-duplex voice session over a websocket.

    Three long-running coroutines cooperate through the instance flags
    ``speaking`` / ``interrupted`` and the ``audio_q`` queue:

    * ``capture_mic``   - streams microphone PCM to the server and detects
      local barge-in (the user talking over the model).
    * ``receive_audio`` - decodes model audio from server messages and queues it.
    * ``play_audio``    - plays queued audio on the speaker.

    NOTE(review): the message keys used here (``serverContent``, ``modelTurn``,
    ``realtime_input``) look like the Gemini Live API; the endpoint itself is
    not visible in this file - confirm.
    NOTE(review): several log prefixes below (e.g. before "Baseline Noise")
    look like mis-decoded emoji; they are kept byte-for-byte - confirm the
    file's intended encoding before normalising them.
    """

    def __init__(self):
        self.sanad_logger = Logs()
        self.sanad_logger.LogEngine("G1_Logs", "gemini_voice")

        # Decoded model audio waiting to be played.
        self.audio_q = asyncio.Queue()
        # True while model audio is being received/played.
        self.speaking = False
        # True after a local barge-in; model audio is dropped while set.
        self.interrupted = False
        self.pya = pyaudio.PyAudio()

        # Barge-in detection tuning.
        self.MIN_THRESHOLD = 3000          # floor for the calibrated threshold
        self.barge_in_threshold = 3000     # mean |sample| that counts as "loud"
        self.REQUIRED_LOUD_CHUNKS = 5      # consecutive loud chunks needed
        self.PREBUFFER_CHUNKS = 1          # queued chunks before playback starts
        self.PLAYBACK_TIMEOUT = 0.2        # seconds play_audio waits on the queue
        self.BARGE_IN_COOLDOWN = 0.7       # seconds detection is blocked after a barge-in
        self.AI_SPEAK_GRACE = 0.25         # seconds after the model starts before counting

        # Timestamps (time.time()) maintained by the coroutines.
        self._last_ai_audio_time = 0.0
        self._ai_speaking_since = 0.0
        self._barge_in_block_until = 0.0

        # Written when model audio arrives; not read anywhere in the visible code.
        self.ECHO_GUARD_SEC = 0.8
        self._ignore_input_until = 0.0

        # While the model speaks, mic chunks quieter than
        # barge_in_threshold * SPEAKING_ENERGY_GATE are replaced with silence.
        self.SEND_SILENCE_WHEN_SPEAKING = True
        self.SPEAKING_ENERGY_GATE = 0.85
        self._silence_pcm = b"\x00" * (CHUNK_SIZE * 2)  # 2 bytes per 16-bit sample

    def audio_energy(self, pcm):
        """Return the integer mean absolute amplitude of 16-bit PCM bytes.

        Returns 0 for empty input or for data that cannot be interpreted as
        native-endian signed 16-bit samples (e.g. an odd byte count).
        """
        try:
            samples = array.array("h", pcm)
            if not samples:
                return 0
            return sum(abs(s) for s in samples) // len(samples)
        except Exception:
            # Best effort: an unreadable chunk is treated as silence.
            return 0

    def calibrate_mic(self):
        """Measure ambient noise and set ``barge_in_threshold`` from it.

        Reads 40 chunks from the microphone (blocking), averages their energy
        and sets the threshold to ``max(MIN_THRESHOLD, 3 * average)``. On any
        failure a warning is logged and the existing threshold is kept.
        """
        self.sanad_logger.print_and_log(
            "\n🤫 Calibrating Microphone... (Please remain silent)",
            message_type="info",
        )
        try:
            stream = self.pya.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=SEND_SAMPLE_RATE,
                input=True,
                frames_per_buffer=CHUNK_SIZE,
            )
            try:
                values = [
                    self.audio_energy(
                        stream.read(CHUNK_SIZE, exception_on_overflow=False)
                    )
                    for _ in range(40)
                ]
            finally:
                # Release the device even when a read fails.
                stream.stop_stream()
                stream.close()

            avg_noise = sum(values) / len(values)
            self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)
            self.sanad_logger.print_and_log(
                f"āœ… Baseline Noise: {avg_noise:.1f}", message_type="info"
            )
            self.sanad_logger.print_and_log(
                f"āœ… Interruption Threshold: {self.barge_in_threshold:.1f}",
                message_type="info",
            )
        except Exception as e:
            self.sanad_logger.print_and_log(
                f"āš ļø Calibration failed: {e}. Using default threshold.",
                message_type="warning",
            )

    def _ws_connect_kwargs(self):
        """Return keyword arguments for ``websockets.connect``.

        The headers keyword is chosen by inspecting the installed
        ``websockets.connect`` signature: ``extra_headers`` when present,
        otherwise ``additional_headers``. If inspection fails,
        ``extra_headers`` is used.
        """
        headers = {"Content-Type": "application/json"}
        try:
            params = inspect.signature(websockets.connect).parameters
            header_key = (
                "extra_headers" if "extra_headers" in params else "additional_headers"
            )
        except Exception:
            header_key = "extra_headers"
        return {"max_size": None, header_key: headers}

    async def send_text_prompt(self, ws, text: str):
        """Send ``text`` as a single, completed user turn over ``ws``."""
        msg = {
            "clientContent": {
                "turns": [{"role": "user", "parts": [{"text": text}]}],
                "turnComplete": True,
            }
        }
        await ws.send(json.dumps(msg))

    async def keepalive(self, ws, every_sec: float = 20.0):
        """Prevent idle disconnects by pinging ``ws`` every ``every_sec`` seconds.

        Returns silently as soon as sleeping or pinging raises an ``Exception``
        (for example when the connection has closed).
        """
        while True:
            try:
                await asyncio.sleep(every_sec)
                await ws.ping()
            except Exception:
                break

    def _close_stream(self, stream):
        """Stop and close a PyAudio stream, logging instead of raising."""
        try:
            stream.stop_stream()
            stream.close()
        except Exception as e:
            self.sanad_logger.print_and_log(
                f"Stream close failed: {e}", message_type="warning"
            )

    async def capture_mic(self, ws):
        """Stream microphone audio to ``ws`` and detect local barge-in.

        Each chunk is read in a worker thread, optionally replaced by silence
        while the model is speaking, and sent base64-encoded. When enough
        consecutive loud chunks are seen while the model speaks, playback is
        interrupted: ``interrupted`` is set, ``speaking`` cleared and
        ``audio_q`` drained. The loop ends when the websocket closes or any
        other error occurs; the input stream is then closed.
        """
        stream = await asyncio.to_thread(
            self.pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_SIZE,
        )
        loud_chunks = 0
        while True:
            try:
                data = await asyncio.to_thread(
                    stream.read, CHUNK_SIZE, exception_on_overflow=False
                )
                energy = self.audio_energy(data)
                now = time.time()

                # Barge-in detection only runs while the model is speaking,
                # outside the post-barge-in cooldown and after the initial
                # grace period.
                if self.speaking and (now >= self._barge_in_block_until):
                    if (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE:
                        loud_chunks = (
                            loud_chunks + 1 if energy > self.barge_in_threshold else 0
                        )
                        # NOTE(review): strict ">" means REQUIRED_LOUD_CHUNKS + 1
                        # consecutive loud chunks are needed - confirm intended.
                        if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                            self.sanad_logger.print_and_log(
                                f"šŸ›‘ Interruption! (Energy: {energy})",
                                message_type="warning",
                            )
                            self.interrupted = True
                            self.speaking = False
                            loud_chunks = 0
                            self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN
                            # Discard audio that has not been played yet.
                            while not self.audio_q.empty():
                                try:
                                    self.audio_q.get_nowait()
                                except asyncio.QueueEmpty:
                                    break

                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = self._silence_pcm

                b64_audio = base64.b64encode(data_to_send).decode("utf-8")
                msg = {
                    "realtime_input": {
                        "media_chunks": [{
                            "data": b64_audio,
                            "mime_type": f"audio/pcm;rate={SEND_SAMPLE_RATE}",
                        }]
                    }
                }
                await ws.send(json.dumps(msg))
            except websockets.exceptions.ConnectionClosed:
                self.sanad_logger.print_and_log(
                    "āš ļø WebSocket closed.", message_type="warning"
                )
                break
            except Exception as e:
                self.sanad_logger.print_and_log(
                    f"āŒ Mic Error: {e}", message_type="error"
                )
                break

        # Closed here rather than in a ``finally``: if this task is cancelled,
        # a worker thread may still be inside stream.read().
        # NOTE(review): closing concurrently with a blocking read is presumably
        # unsafe - confirm against PyAudio/PortAudio before moving this.
        self._close_stream(stream)

    async def _enqueue_model_audio(self, model_turn):
        """Decode every inline audio part of ``model_turn`` and queue it."""
        for part in model_turn.get("parts", []):
            inline_data = part.get("inlineData")
            if not (inline_data and inline_data.get("data")):
                continue
            now = time.time()
            if not self.speaking:
                self._ai_speaking_since = now
            self.speaking = True
            self._last_ai_audio_time = now
            self._ignore_input_until = now + self.ECHO_GUARD_SEC
            await self.audio_q.put(base64.b64decode(inline_data["data"]))

    async def receive_audio(self, ws):
        """Read server messages from ``ws`` and queue model audio for playback.

        While ``interrupted`` is set (local barge-in), audio belonging to the
        turn that was interrupted is dropped. The flag is cleared when the
        server reports ``interrupted`` or when the first content of a new model
        turn arrives after ``turnComplete``. Without the second rule, a
        barge-in on a turn the server had already finished would mute every
        later response. Per-message errors are logged and the loop continues;
        a closed connection ends the coroutine.
        """
        # True when no model turn is in progress, so the next modelTurn
        # message starts a new one.
        turn_done = True
        try:
            async for msg in ws:
                try:
                    response = json.loads(msg)
                    # ``or {}`` also covers an explicit ``"serverContent": null``.
                    server_content = response.get("serverContent") or {}

                    if server_content.get("interrupted"):
                        self.interrupted = False
                        turn_done = True

                    model_turn = server_content.get("modelTurn")
                    if model_turn:
                        if turn_done:
                            # New turn: a barge-in on the previous turn no
                            # longer applies.
                            self.interrupted = False
                            turn_done = False
                        if not self.interrupted:
                            await self._enqueue_model_audio(model_turn)

                    if server_content.get("turnComplete"):
                        turn_done = True
                except Exception as e:
                    self.sanad_logger.print_and_log(
                        f"āŒ Parse Error: {e}", message_type="error"
                    )
        except websockets.exceptions.ConnectionClosed:
            self.sanad_logger.print_and_log(
                "āš ļø WebSocket closed.", message_type="warning"
            )

    async def play_audio(self):
        """Play queued model audio on the speaker until an error occurs.

        Waits for ``PREBUFFER_CHUNKS`` queued chunks before starting a new
        utterance, writes each chunk in a worker thread, and clears
        ``speaking`` once the queue is empty and no audio has arrived for
        0.20 s. Playback is paused while ``interrupted`` is set. The output
        stream is closed when the loop ends after an error.
        """
        stream = await asyncio.to_thread(
            self.pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=RECEIVE_SAMPLE_RATE,
            output=True,
            frames_per_buffer=CHUNK_SIZE,
        )
        idle_gap = 0.20  # seconds without new audio before the model counts as done
        buffered = False
        while True:
            try:
                if self.interrupted:
                    await asyncio.sleep(0.01)
                    continue

                if self.speaking and not buffered:
                    while (
                        self.audio_q.qsize() < self.PREBUFFER_CHUNKS
                        and self.speaking
                        and not self.interrupted
                    ):
                        await asyncio.sleep(0.005)
                    buffered = True

                try:
                    data = await asyncio.wait_for(
                        self.audio_q.get(), timeout=self.PLAYBACK_TIMEOUT
                    )
                except asyncio.TimeoutError:
                    data = None

                if data:
                    await asyncio.to_thread(stream.write, data)

                # Same end-of-utterance check after a timeout and after a write.
                if (
                    self.audio_q.empty()
                    and (time.time() - self._last_ai_audio_time) > idle_gap
                ):
                    self.speaking = False
                    buffered = False
            except Exception as e:
                self.sanad_logger.print_and_log(
                    f"āŒ Speaker Error: {e}", message_type="error"
                )
                break

        # Closed only on this exit path; on cancellation a worker thread may
        # still be inside stream.write().
        self._close_stream(stream)