Manual_Photographer/gemini_voice.py
2026-04-12 18:53:20 +04:00

248 lines
9.0 KiB
Python

# gemini_voice.py
import asyncio
import base64
import json
import time
import array
import inspect
import pyaudio
import websockets
from Logger import Logs
# Audio stream settings shared by the microphone and speaker streams below.
FORMAT = pyaudio.paInt16  # sample format passed to every PyAudio stream opened in this file
CHANNELS = 1  # channel count passed to every stream (single channel)
SEND_SAMPLE_RATE = 16000  # Hz; microphone stream rate, also written into the outgoing mime type
RECEIVE_SAMPLE_RATE = 24000  # Hz; rate of the output stream that plays received audio
CHUNK_SIZE = 256  # frames per buffer for each stream and frames requested per microphone read
class HamadGeminiVoice:
def __init__(self):
    """Create the logger, audio queue and PyAudio handle, then set tuning values.

    Nothing is opened here: streams are created later by ``calibrate_mic``,
    ``capture_mic`` and ``play_audio``.
    """
    # Side-effecting constructors first, in a fixed order.
    self.sanad_logger = logger = Logs()
    # NOTE(review): the meaning of the two LogEngine arguments is defined in
    # the Logger module, which is not visible here.
    logger.LogEngine("G1_Logs", "gemini_voice")
    self.audio_q = asyncio.Queue()  # decoded audio waiting to be played
    self.pya = pyaudio.PyAudio()

    # Conversation state shared between the capture/receive/playback tasks.
    self.speaking = self.interrupted = False

    # Barge-in detection (see capture_mic).
    self.MIN_THRESHOLD = 3000        # floor applied by calibrate_mic
    self.barge_in_threshold = 3000   # energy a chunk must exceed to count as loud
    self.REQUIRED_LOUD_CHUNKS = 5    # consecutive loud chunks that must be exceeded
    self.BARGE_IN_COOLDOWN = 0.7     # seconds barge-in stays blocked after a trigger
    self.AI_SPEAK_GRACE = 0.25       # seconds after AI speech starts before detection
    self._barge_in_block_until = 0.0

    # Playback pacing (see play_audio).
    self.PREBUFFER_CHUNKS = 1        # queued chunks to wait for before playing
    self.PLAYBACK_TIMEOUT = 0.2      # seconds to wait for the next queued chunk

    # Timestamps maintained by receive_audio.
    self._last_ai_audio_time = self._ai_speaking_since = 0.0
    self.ECHO_GUARD_SEC = 0.8
    # Written by receive_audio; no reader is visible in this part of the file.
    self._ignore_input_until = 0.0

    # Microphone gating while the AI is speaking (see capture_mic).
    self.SEND_SILENCE_WHEN_SPEAKING = True
    self.SPEAKING_ENERGY_GATE = 0.85  # fraction of barge_in_threshold used as the gate
    self._silence_pcm = bytes(CHUNK_SIZE * 2)  # CHUNK_SIZE * 2 zero bytes
def audio_energy(self, pcm):
    """Return the mean absolute sample value of a PCM buffer.

    The buffer is decoded as native-endian signed shorts (array type "h").

    Args:
        pcm: Bytes-like PCM data (any initializer accepted by
            ``array.array("h", ...)`` also works).

    Returns:
        The integer mean of ``abs(sample)``, or 0 when the buffer is empty
        or cannot be decoded.
    """
    try:
        # A truncated read can leave a dangling byte; array() would reject
        # the whole buffer and a loud chunk would be scored as silence.
        # Drop only the incomplete trailing sample instead.
        if isinstance(pcm, (bytes, bytearray)) and len(pcm) % 2:
            pcm = pcm[:-1]
        samples = array.array("h", pcm)
    except (TypeError, ValueError, OverflowError):
        # Best effort: undecodable input counts as silence.
        return 0
    if not samples:
        return 0
    return sum(map(abs, samples)) // len(samples)
def calibrate_mic(self):
    """Measure ambient noise and derive the barge-in threshold from it.

    Reads 40 chunks from a temporary input stream, averages their energy
    and sets ``barge_in_threshold`` to three times that average, but never
    below ``MIN_THRESHOLD``. Any failure is logged and the current threshold
    is kept. This call blocks while the chunks are read.
    """
    self.sanad_logger.print_and_log("\n🤫 Calibrating Microphone... (Please remain silent)", message_type="info")
    stream = None
    try:
        stream = self.pya.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_SIZE,
        )
        values = [
            self.audio_energy(stream.read(CHUNK_SIZE, exception_on_overflow=False))
            for _ in range(40)
        ]
        avg_noise = sum(values) / len(values)
        self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)
        self.sanad_logger.print_and_log(f"✅ Baseline Noise: {avg_noise:.1f}", message_type="info")
        self.sanad_logger.print_and_log(f"✅ Interruption Threshold: {self.barge_in_threshold:.1f}", message_type="info")
    except Exception as e:
        self.sanad_logger.print_and_log(f"⚠️ Calibration failed: {e}. Using default threshold.", message_type="warning")
    finally:
        # Release the device even when a read failed part-way through;
        # otherwise the input stream stays open for the life of the process.
        if stream is not None:
            for release in (stream.stop_stream, stream.close):
                try:
                    release()
                except Exception as e:
                    self.sanad_logger.print_and_log(f"⚠️ Could not release calibration stream: {e}", message_type="warning")
def _ws_connect_kwargs(self):
kwargs = {"max_size": None}
try:
sig = inspect.signature(websockets.connect)
if "extra_headers" in sig.parameters:
kwargs["extra_headers"] = {"Content-Type": "application/json"}
else:
kwargs["additional_headers"] = {"Content-Type": "application/json"}
except Exception:
kwargs["extra_headers"] = {"Content-Type": "application/json"}
return kwargs
async def send_text_prompt(self, ws, text: str):
    """Send ``text`` as one completed user turn over the websocket.

    Args:
        ws: Connection object exposing an awaitable ``send``.
        text: Prompt text placed in the single part of the user turn.
    """
    user_turn = {"role": "user", "parts": [{"text": text}]}
    payload = {"clientContent": {"turns": [user_turn], "turnComplete": True}}
    await ws.send(json.dumps(payload))
async def keepalive(self, ws, every_sec: float = 20.0):
    """Ping the websocket at a fixed interval until something fails.

    Sleeps ``every_sec`` seconds, awaits ``ws.ping()``, and repeats. The
    first exception raised by either step ends the coroutine quietly.

    Args:
        ws: Connection object exposing an awaitable ``ping``.
        every_sec: Seconds to wait before each ping.
    """
    try:
        while True:
            await asyncio.sleep(every_sec)
            await ws.ping()
    except Exception:
        # Any failure (for example a closed connection) stops the keepalive.
        return
async def capture_mic(self, ws):
    """Stream microphone audio to the websocket and detect barge-in.

    Each chunk read from the microphone is scored with ``audio_energy``.
    While the AI is speaking, outside the cooldown and grace windows, more
    than ``REQUIRED_LOUD_CHUNKS`` consecutive chunks above
    ``barge_in_threshold`` trigger an interruption: playback state is
    cleared, queued audio is discarded and a cooldown starts.

    When ``SEND_SILENCE_WHEN_SPEAKING`` is set and the AI is speaking,
    chunks quieter than ``barge_in_threshold * SPEAKING_ENERGY_GATE`` are
    replaced by silence before being sent.

    The loop ends when the websocket closes or any other error occurs. The
    microphone stream is always released on exit.

    Args:
        ws: Connection object exposing an awaitable ``send``.
    """
    stream = await asyncio.to_thread(
        self.pya.open,
        format=FORMAT,
        channels=CHANNELS,
        rate=SEND_SAMPLE_RATE,
        input=True,
        frames_per_buffer=CHUNK_SIZE,
    )
    mime_type = f"audio/pcm;rate={SEND_SAMPLE_RATE}"  # loop-invariant
    loud_chunks = 0
    try:
        while True:
            try:
                data = await asyncio.to_thread(stream.read, CHUNK_SIZE, exception_on_overflow=False)
                energy = self.audio_energy(data)
                now = time.time()

                if not self.speaking:
                    # The counter tracks consecutive loud chunks within one
                    # AI utterance; do not carry a partial count into the next.
                    loud_chunks = 0
                elif now >= self._barge_in_block_until and (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE:
                    loud_chunks = loud_chunks + 1 if energy > self.barge_in_threshold else 0
                    if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                        self.sanad_logger.print_and_log(f"🛑 Interruption! (Energy: {energy})", message_type="warning")
                        self.interrupted = True
                        self.speaking = False
                        loud_chunks = 0
                        self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN
                        # Discard audio that was queued but not yet played.
                        while not self.audio_q.empty():
                            try:
                                self.audio_q.get_nowait()
                            except asyncio.QueueEmpty:
                                break

                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = self._silence_pcm

                msg = {
                    "realtime_input": {
                        "media_chunks": [{
                            "data": base64.b64encode(data_to_send).decode("utf-8"),
                            "mime_type": mime_type,
                        }]
                    }
                }
                await ws.send(json.dumps(msg))
            except websockets.exceptions.ConnectionClosed:
                self.sanad_logger.print_and_log("⚠️ WebSocket closed.", message_type="warning")
                break
            except Exception as e:
                self.sanad_logger.print_and_log(f"❌ Mic Error: {e}", message_type="error")
                break
    finally:
        # Release the input device on every exit path (break or task
        # cancellation) so it can be reopened later.
        # NOTE(review): on cancellation a worker-thread read may still be in
        # flight when the stream is stopped; confirm this is acceptable for
        # the audio backend in use.
        for release in (stream.stop_stream, stream.close):
            try:
                release()
            except Exception as e:
                self.sanad_logger.print_and_log(f"⚠️ Could not release microphone stream: {e}", message_type="warning")
async def receive_audio(self, ws):
    """Read server messages and queue decoded model audio for playback.

    For every message the ``serverContent`` object is inspected:

    * ``interrupted`` clears the local ``interrupted`` flag.
    * While the local flag is set, model audio is dropped. ``turnComplete``
      also clears the flag (and that message is skipped), so a local
      barge-in discards at most the rest of one model turn even if the
      server never reports an interruption.
    * Each ``modelTurn`` part carrying ``inlineData.data`` is base64-decoded
      and put on ``audio_q``; the speaking state and timestamps are updated.

    Per-message failures are logged and the loop continues. The coroutine
    returns when the connection closes.

    Args:
        ws: Async-iterable connection yielding JSON messages.
    """
    try:
        async for msg in ws:
            try:
                response = json.loads(msg)
                # Tolerate an explicit null as well as a missing key.
                server_content = response.get("serverContent") or {}

                if server_content.get("interrupted"):
                    self.interrupted = False
                elif self.interrupted and server_content.get("turnComplete"):
                    # Fallback: without this the flag is only cleared by a
                    # server "interrupted" message; if that never arrives,
                    # every later audio chunk is dropped and playback stalls.
                    self.interrupted = False
                    continue

                if self.interrupted:
                    continue

                model_turn = server_content.get("modelTurn")
                if not model_turn:
                    continue

                for part in model_turn.get("parts", []):
                    inline_data = part.get("inlineData")
                    if not (inline_data and inline_data.get("data")):
                        continue
                    now = time.time()
                    if not self.speaking:
                        # First chunk of an utterance starts the grace window.
                        self._ai_speaking_since = now
                    self.speaking = True
                    self._last_ai_audio_time = now
                    self._ignore_input_until = now + self.ECHO_GUARD_SEC
                    audio_bytes = base64.b64decode(inline_data["data"])
                    await self.audio_q.put(audio_bytes)
            except Exception as e:
                self.sanad_logger.print_and_log(f"❌ Parse Error: {e}", message_type="error")
    except websockets.exceptions.ConnectionClosed:
        self.sanad_logger.print_and_log("⚠️ WebSocket closed.", message_type="warning")
async def play_audio(self):
    """Play queued audio chunks on the output device until an error occurs.

    While ``interrupted`` is set the loop idles. At the start of an
    utterance it waits until ``PREBUFFER_CHUNKS`` chunks are queued, then
    writes chunks to the output stream as they arrive. When the queue is
    empty and no audio has been received for 0.20 s, ``speaking`` is
    cleared.

    The output stream is always released on exit.
    """
    stream = await asyncio.to_thread(
        self.pya.open,
        format=FORMAT,
        channels=CHANNELS,
        rate=RECEIVE_SAMPLE_RATE,
        output=True,
        frames_per_buffer=CHUNK_SIZE,
    )
    idle_after = 0.20  # seconds without new audio before the utterance ends
    buffered = False
    try:
        while True:
            try:
                if self.interrupted:
                    # The next utterance must pre-buffer again.
                    buffered = False
                    await asyncio.sleep(0.01)
                    continue

                if self.speaking and not buffered:
                    while self.audio_q.qsize() < self.PREBUFFER_CHUNKS and self.speaking and not self.interrupted:
                        await asyncio.sleep(0.005)
                    buffered = True

                try:
                    data = await asyncio.wait_for(self.audio_q.get(), timeout=self.PLAYBACK_TIMEOUT)
                except asyncio.TimeoutError:
                    if self.audio_q.empty() and (time.time() - self._last_ai_audio_time) > idle_after:
                        self.speaking = False
                        buffered = False
                    continue

                # A barge-in may have fired while waiting on the queue; the
                # chunk in hand belongs to the cancelled utterance, so skip it.
                if data and not self.interrupted:
                    await asyncio.to_thread(stream.write, data)

                if self.audio_q.empty() and (time.time() - self._last_ai_audio_time) > idle_after:
                    self.speaking = False
                    buffered = False
            except Exception as e:
                self.sanad_logger.print_and_log(f"❌ Speaker Error: {e}", message_type="error")
                break
    finally:
        # Release the output device on every exit path (break or task
        # cancellation).
        # NOTE(review): on cancellation a worker-thread write may still be in
        # flight when the stream is stopped; confirm this is acceptable for
        # the audio backend in use.
        for release in (stream.stop_stream, stream.close):
            try:
                release()
            except Exception as e:
                self.sanad_logger.print_and_log(f"⚠️ Could not release speaker stream: {e}", message_type="warning")