248 lines
9.0 KiB
Python
248 lines
9.0 KiB
Python
# gemini_voice.py
|
|
import asyncio
|
|
import base64
|
|
import json
|
|
import time
|
|
import array
|
|
import inspect
|
|
|
|
import pyaudio
|
|
import websockets
|
|
from Logger import Logs
|
|
|
|
# Audio stream settings shared by every PyAudio stream this module opens.
FORMAT = pyaudio.paInt16  # sample format passed to PyAudio.open()
CHANNELS = 1  # single-channel capture and playback
SEND_SAMPLE_RATE = 16_000  # rate of the microphone stream sent upstream
RECEIVE_SAMPLE_RATE = 24_000  # rate of the speaker stream used for playback
CHUNK_SIZE = 256  # frames per buffer / frames read per microphone chunk
|
|
|
|
|
|
|
|
|
|
class HamadGeminiVoice:
    """Audio plumbing for a realtime voice session over a websocket.

    The class owns one PyAudio handle plus the state shared by three
    long-running coroutines that are expected to run concurrently:

    * ``capture_mic`` streams microphone PCM to the server and detects a
      local barge-in (the user talking over the assistant).
    * ``receive_audio`` decodes audio parts of server messages into
      ``audio_q``.
    * ``play_audio`` drains ``audio_q`` to the speaker.

    ``speaking`` is True while assistant audio is being received/played.
    ``interrupted`` is a latch: while it is set, incoming model audio is
    dropped and playback is paused.  It is set by a local barge-in and
    released when the server reports ``interrupted`` or ``turnComplete``.
    """

    def __init__(self):
        """Create the logger, the PyAudio handle and all tuning/state fields."""
        self.sanad_logger = Logs()
        self.sanad_logger.LogEngine("G1_Logs", "gemini_voice")

        # Decoded assistant audio waiting to be written to the speaker.
        self.audio_q = asyncio.Queue()
        self.speaking = False
        self.interrupted = False
        self.pya = pyaudio.PyAudio()

        # Barge-in detection. Energies are mean absolute 16-bit sample values.
        self.MIN_THRESHOLD = 3000  # floor applied by calibrate_mic()
        self.barge_in_threshold = 3000  # default until calibrate_mic() runs
        self.REQUIRED_LOUD_CHUNKS = 5  # consecutive loud chunks (see capture_mic)

        self.PREBUFFER_CHUNKS = 1  # chunks queued before playback starts
        self.PLAYBACK_TIMEOUT = 0.2  # seconds to wait for the next queued chunk
        self.BARGE_IN_COOLDOWN = 0.7  # seconds detection is blocked after a barge-in
        self.AI_SPEAK_GRACE = 0.25  # seconds after speech starts before detection arms
        # Seconds without new audio (and with an empty queue) after which the
        # assistant is considered to have stopped speaking.
        self.SPEAKING_IDLE_SEC = 0.20

        # time.time() stamps maintained by the coroutines.
        self._last_ai_audio_time = 0.0
        self._ai_speaking_since = 0.0
        self._barge_in_block_until = 0.0

        # Updated on every received audio chunk; not consulted in this class.
        self.ECHO_GUARD_SEC = 0.8
        self._ignore_input_until = 0.0

        # While the assistant speaks, quiet microphone chunks are replaced by
        # silence before being sent (see capture_mic).
        self.SEND_SILENCE_WHEN_SPEAKING = True
        self.SPEAKING_ENERGY_GATE = 0.85  # fraction of barge_in_threshold
        self._silence_pcm = b"\x00" * (CHUNK_SIZE * 2)  # one chunk of 16-bit zeros

        # True between the first modelTurn message of a server turn and the
        # matching turnComplete/interrupted message.
        self._model_turn_active = False

    # ------------------------------------------------------------------ helpers

    def _flush_audio_queue(self) -> None:
        """Discard every chunk still waiting in ``audio_q``."""
        while True:
            try:
                self.audio_q.get_nowait()
            except asyncio.QueueEmpty:
                return

    def _close_stream(self, stream) -> None:
        """Stop and close a PyAudio stream, logging (not raising) failures."""
        for release in (stream.stop_stream, stream.close):
            try:
                release()
            except Exception as e:
                self.sanad_logger.print_and_log(
                    f"⚠️ Audio stream cleanup failed: {e}", message_type="warning"
                )

    def _audio_stalled(self) -> bool:
        """Return True when no assistant audio arrived for SPEAKING_IDLE_SEC."""
        return (time.time() - self._last_ai_audio_time) > self.SPEAKING_IDLE_SEC

    def _barge_in_armed(self, now: float) -> bool:
        """Return True when loud microphone input should count as a barge-in."""
        return (
            self.speaking
            and now >= self._barge_in_block_until
            and (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE
        )

    def _trigger_barge_in(self, now: float, energy: int) -> None:
        """Stop local playback because the user started talking."""
        self.sanad_logger.print_and_log(
            f"🛑 Interruption! (Energy: {energy})", message_type="warning"
        )
        # Latch only while a model turn is still in flight: the latch is
        # released by the server's interrupted/turnComplete message, which
        # never arrives for a turn that has already completed.
        if self._model_turn_active:
            self.interrupted = True
        self.speaking = False
        self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN
        self._flush_audio_queue()

    # --------------------------------------------------------------- public API

    def audio_energy(self, pcm) -> int:
        """Return the mean absolute value of native-endian 16-bit samples.

        Returns 0 for empty input and for input that cannot be interpreted
        as whole 16-bit samples (e.g. an odd number of bytes).
        """
        try:
            samples = array.array("h", pcm)
        except (TypeError, ValueError, OverflowError):
            return 0
        if not samples:
            return 0
        return sum(abs(s) for s in samples) // len(samples)

    def calibrate_mic(self) -> None:
        """Measure ambient noise and derive ``barge_in_threshold`` from it.

        Reads 40 microphone chunks synchronously, averages their energy and
        sets the threshold to three times that average, never lower than
        ``MIN_THRESHOLD``.  On any failure the previous threshold is kept
        and a warning is logged.
        """
        self.sanad_logger.print_and_log("\n🤫 Calibrating Microphone... (Please remain silent)", message_type="info")
        try:
            stream = self.pya.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=SEND_SAMPLE_RATE,
                input=True,
                frames_per_buffer=CHUNK_SIZE,
            )
            try:
                values = [
                    self.audio_energy(
                        stream.read(CHUNK_SIZE, exception_on_overflow=False)
                    )
                    for _ in range(40)
                ]
            finally:
                # Release the device even when a read fails part-way through.
                self._close_stream(stream)

            avg_noise = sum(values) / len(values)
            self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)

            self.sanad_logger.print_and_log(f"✅ Baseline Noise: {avg_noise:.1f}", message_type="info")
            self.sanad_logger.print_and_log(f"✅ Interruption Threshold: {self.barge_in_threshold:.1f}", message_type="info")
        except Exception as e:
            self.sanad_logger.print_and_log(f"⚠️ Calibration failed: {e}. Using default threshold.", message_type="warning")

    def _ws_connect_kwargs(self) -> dict:
        """Build keyword arguments for ``websockets.connect``.

        The header keyword is chosen by inspecting the installed
        ``websockets.connect`` signature: ``extra_headers`` when it is
        accepted, otherwise ``additional_headers``.  If the signature cannot
        be inspected, ``extra_headers`` is used.
        """
        header_kw = "extra_headers"
        try:
            params = inspect.signature(websockets.connect).parameters
            if "extra_headers" not in params:
                header_kw = "additional_headers"
        except (TypeError, ValueError):
            pass  # no introspectable signature: keep the extra_headers default
        return {"max_size": None, header_kw: {"Content-Type": "application/json"}}

    async def send_text_prompt(self, ws, text: str) -> None:
        """Send ``text`` as one complete user turn over ``ws``."""
        msg = {
            "clientContent": {
                "turns": [{"role": "user", "parts": [{"text": text}]}],
                "turnComplete": True,
            }
        }
        await ws.send(json.dumps(msg))

    async def keepalive(self, ws, every_sec: float = 20.0) -> None:
        """Prevent idle disconnects by pinging ``ws`` every ``every_sec`` seconds.

        Returns (after logging a warning) as soon as a ping fails.
        """
        try:
            while True:
                await asyncio.sleep(every_sec)
                await ws.ping()
        except Exception as e:
            self.sanad_logger.print_and_log(
                f"⚠️ Keepalive stopped: {e}", message_type="warning"
            )

    async def capture_mic(self, ws) -> None:
        """Stream microphone audio to ``ws`` and detect local barge-in.

        Runs until the websocket closes or an error occurs; the microphone
        stream is always closed on exit.
        """
        stream = await asyncio.to_thread(
            self.pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_SIZE,
        )

        loud_chunks = 0
        try:
            while True:
                data = await asyncio.to_thread(
                    stream.read, CHUNK_SIZE, exception_on_overflow=False
                )
                energy = self.audio_energy(data)
                now = time.time()

                if self._barge_in_armed(now):
                    loud_chunks = (
                        loud_chunks + 1 if energy > self.barge_in_threshold else 0
                    )
                    # NOTE(review): strict ">" means REQUIRED_LOUD_CHUNKS + 1
                    # consecutive loud chunks are needed - confirm intended.
                    if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                        self._trigger_barge_in(now, energy)
                        loud_chunks = 0
                else:
                    # Never carry a partial count across speaking sessions or
                    # through the grace/cooldown windows.
                    loud_chunks = 0

                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    # While the assistant talks, forward only input that is
                    # close to the barge-in level; everything quieter is
                    # replaced by silence.
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = self._silence_pcm

                msg = {
                    "realtime_input": {
                        "media_chunks": [{
                            "data": base64.b64encode(data_to_send).decode("utf-8"),
                            "mime_type": f"audio/pcm;rate={SEND_SAMPLE_RATE}",
                        }]
                    }
                }
                await ws.send(json.dumps(msg))
        except websockets.exceptions.ConnectionClosed:
            self.sanad_logger.print_and_log("⚠️ WebSocket closed.", message_type="warning")
        except Exception as e:
            self.sanad_logger.print_and_log(f"❌ Mic Error: {e}", message_type="error")
        finally:
            self._close_stream(stream)

    async def _handle_server_content(self, server_content: dict) -> None:
        """Apply one ``serverContent`` payload to the playback state."""
        if server_content.get("interrupted"):
            # The server cancelled its turn: release the latch and drop
            # whatever is still queued from the cancelled turn.
            self.interrupted = False
            self._model_turn_active = False
            self.speaking = False
            self._flush_audio_queue()

        model_turn = server_content.get("modelTurn")
        if model_turn:
            self._model_turn_active = True
            if not self.interrupted:
                for part in model_turn.get("parts", []):
                    inline_data = part.get("inlineData")
                    if not (inline_data and inline_data.get("data")):
                        continue
                    now = time.time()
                    if not self.speaking:
                        self._ai_speaking_since = now
                    self.speaking = True
                    self._last_ai_audio_time = now
                    self._ignore_input_until = now + self.ECHO_GUARD_SEC
                    await self.audio_q.put(base64.b64decode(inline_data["data"]))

        if server_content.get("turnComplete"):
            # End of the server turn: nothing stale can arrive any more, so a
            # latch left by a local barge-in must not outlive the turn.
            self._model_turn_active = False
            self.interrupted = False

    async def receive_audio(self, ws) -> None:
        """Read server messages from ``ws`` and queue their audio for playback.

        Audio is dropped while ``interrupted`` is latched.  A message that
        cannot be processed is logged and skipped; the loop ends when the
        websocket closes.
        """
        try:
            async for msg in ws:
                try:
                    response = json.loads(msg)
                    # "or {}" also covers an explicit null serverContent.
                    await self._handle_server_content(
                        response.get("serverContent") or {}
                    )
                except Exception as e:
                    self.sanad_logger.print_and_log(f"❌ Parse Error: {e}", message_type="error")
        except websockets.exceptions.ConnectionClosed:
            self.sanad_logger.print_and_log("⚠️ WebSocket closed.", message_type="warning")

    async def play_audio(self) -> None:
        """Write queued assistant audio to the speaker until an error occurs.

        Playback pauses while ``interrupted`` is latched.  ``speaking`` is
        cleared once the queue is empty and no audio has arrived for
        ``SPEAKING_IDLE_SEC``.  The output stream is always closed on exit.
        """
        stream = await asyncio.to_thread(
            self.pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=RECEIVE_SAMPLE_RATE,
            output=True,
            frames_per_buffer=CHUNK_SIZE,
        )

        buffered = False
        try:
            while True:
                if self.interrupted:
                    buffered = False  # pre-buffer again for the next turn
                    await asyncio.sleep(0.01)
                    continue

                if self.speaking and not buffered:
                    # Wait for the pre-buffer to fill, but give up once audio
                    # stops arriving so a short reply cannot stall playback.
                    while (
                        self.audio_q.qsize() < self.PREBUFFER_CHUNKS
                        and self.speaking
                        and not self.interrupted
                        and not self._audio_stalled()
                    ):
                        await asyncio.sleep(0.005)
                    buffered = True

                try:
                    data = await asyncio.wait_for(
                        self.audio_q.get(), timeout=self.PLAYBACK_TIMEOUT
                    )
                except asyncio.TimeoutError:
                    data = None

                if data:
                    await asyncio.to_thread(stream.write, data)

                if self.audio_q.empty() and self._audio_stalled():
                    self.speaking = False
                    buffered = False
        except Exception as e:
            self.sanad_logger.print_and_log(f"❌ Speaker Error: {e}", message_type="error")
        finally:
            self._close_stream(stream)
|