#!/usr/bin/env python3
"""
Voice/marcus_gemini_voice.py — Marcus Gemini Live Voice Module v2
==================================================================
Real-time bidirectional voice conversation using Gemini 2.5 Flash native audio.
Uses G1 built-in speaker + Hollyland wireless mic.

Based on SanadVoice/gemini_interact architecture:
- PyAudio for mic (not parec)
- Echo suppression (silence when speaking)
- Gemini VAD (automatic activity detection)
- thinkingBudget=0 (no thinking text)
- ASR buffering for full sentences
- Vision routed to brain's Qwen camera

Usage:
    from Voice.marcus_gemini_voice import GeminiVoiceModule
    voice = GeminiVoiceModule(audio_api, on_transcript=callback)
    voice.start()
"""

import array
import asyncio
import base64
import json
import logging
import os
import subprocess
import threading
import time

import numpy as np

from dotenv import load_dotenv

load_dotenv()

BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)

LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("gemini_voice")


def load_config(name: str) -> dict:
    path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
    with open(path, "r") as f:
        return json.load(f)


# ─── CONFIGURATION ────────────────────────────────────────

# API key is read from the environment (populated by load_dotenv above) rather
# than hardcoded; the variable name GEMINI_API_KEY is an assumption, so match
# it to whatever your .env provides.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
URI = (
    "wss://generativelanguage.googleapis.com/ws/"
    "google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent"
    f"?key={API_KEY}"
)

VOICE_NAME = "Charon"
SEND_RATE = 16000
RECEIVE_RATE = 24000
CHUNK_SIZE = 512
CHANNELS = 1


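# Audio path note: the mic is opened at its native sample rate and resampled to
# SEND_RATE (16 kHz mono) before streaming to Gemini; Gemini answers with
# RECEIVE_RATE (24 kHz) PCM, which is resampled back down to 16 kHz before
# playback on the G1 speaker.
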
def load_system_prompt():
    paths = [
        os.path.join(PROJECT_ROOT, "Config", "marcus_voice_prompt.txt"),
    ]
    for p in paths:
        if os.path.exists(p):
            with open(p, "r", encoding="utf-8-sig") as f:
                return f.read().strip()
    return (
        "You are Marcus, a humanoid robot by YS Lootah Technology, Dubai. "
        "Speak Arabic (UAE dialect) and English. Be concise — 1-2 sentences max."
    )


# ─── AUDIO HELPERS ────────────────────────────────────────

def audio_energy(pcm: bytes) -> int:
    """Mean absolute amplitude of 16-bit mono PCM (0 on decode failure)."""
    try:
        samples = array.array("h", pcm)
        if not samples:
            return 0
        return sum(abs(s) for s in samples) // len(samples)
    except Exception:
        return 0


# One chunk of 16-bit silence, sent in place of mic audio during echo suppression.
SILENCE_PCM = b'\x00' * (CHUNK_SIZE * 2)


# ─── GEMINI VOICE MODULE ─────────────────────────────────

class GeminiVoiceModule:
    """Real-time voice conversation via Gemini, using G1 speaker + Hollyland mic."""

    def __init__(self, audio_api, on_transcript=None):
        self._audio = audio_api
        self._on_transcript = on_transcript
        self._config = load_config("Voice")
        self._mic_source = getattr(audio_api, '_mic_source',
                                   self._config["mic"].get("source_index", "0"))

        # State
        self.speaking = False
        self.interrupted = False
        self._running = False
        self._thread = None
        self._audio_queue = None  # Created in async context

        # Tuning
        self.MIN_THRESHOLD = 3000          # floor for the barge-in energy threshold
        self.barge_in_threshold = self.MIN_THRESHOLD
        self.REQUIRED_LOUD_CHUNKS = 10     # consecutive loud chunks needed to trigger barge-in
        self.PREBUFFER_CHUNKS = 2          # chunks queued before playback collection starts
        self.PLAYBACK_TIMEOUT = 0.25       # seconds to wait for the next audio chunk
        self.BARGE_IN_COOLDOWN = 0.7       # seconds before another barge-in may trigger
        self.AI_SPEAK_GRACE = 0.20         # seconds after AI starts speaking before barge-in is allowed
        self.ECHO_GUARD_SEC = 0.8          # ignore user transcription this long after each AI audio chunk
        self.SPEAKING_ENERGY_GATE = 0.85   # while speaking, mic chunks below this fraction of threshold become silence
        self.SEND_SILENCE_WHEN_SPEAKING = True

        # Timing
        self._ai_speaking_since = 0.0
        self._last_ai_audio_time = 0.0
        self._barge_in_block_until = 0.0
        self._ignore_input_until = 0.0

        # ASR buffer
        self._asr_buf = ""
        self._asr_last_time = 0.0
        self.ASR_WINDOW_SEC = 2.0

        # Find Hollyland mic PyAudio device index
        self._mic_device_idx = self._find_mic_device()

        log.info("GeminiVoiceModule v2 initialized")

    # ─── MIC DEVICE DETECTION ─────────────────────────────

    def _find_mic_device(self) -> int:
        """Find Hollyland wireless mic in PyAudio devices. Returns device index."""
        import pyaudio
        import ctypes

        # Silence ALSA's noisy stderr output while PyAudio enumerates devices.
        ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int,
                                              ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)

        def _alsa_error_handler(filename, line, function, err, fmt):
            pass  # suppress

        c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler)
        try:
            asound = ctypes.cdll.LoadLibrary("libasound.so.2")
            asound.snd_lib_error_set_handler(c_error_handler)
        except Exception:
            pass  # libasound unavailable; skip ALSA log suppression

        pa = pyaudio.PyAudio()
        try:
            # Make sure the configured PulseAudio source is unmuted at full volume
            subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
            subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)

            # Search for wireless mic by name
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                name = info.get("name", "").lower()
                if info["maxInputChannels"] > 0 and ("wireless" in name or "hollyland" in name):
                    log.info("Mic found: [%d] %s (%dHz)", i, info["name"], int(info["defaultSampleRate"]))
                    return i

            # Fall back to the 'default' or 'pulse' device
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                if info["maxInputChannels"] > 0 and info.get("name", "") in ("default", "pulse"):
                    log.info("Mic fallback: [%d] %s", i, info["name"])
                    return i

            log.warning("No mic found, using device 0")
            return 0
        finally:
            pa.terminate()

    # ─── MIC CALIBRATION ──────────────────────────────────

    def _calibrate_mic(self):
        """Calibrate barge-in threshold from ambient noise."""
        import pyaudio
        import ctypes

        ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int,
                                              ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)

        def _alsa_error_handler(filename, line, function, err, fmt):
            pass  # suppress

        c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler)
        try:
            asound = ctypes.cdll.LoadLibrary("libasound.so.2")
            asound.snd_lib_error_set_handler(c_error_handler)
        except Exception:
            pass  # libasound unavailable; skip ALSA log suppression

        pa = pyaudio.PyAudio()
        try:
            mic_info = pa.get_device_info_by_index(self._mic_device_idx)
            mic_rate = int(mic_info["defaultSampleRate"])
            mic_channels = 1
            stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
                             rate=mic_rate, input=True,
                             input_device_index=self._mic_device_idx,
                             frames_per_buffer=CHUNK_SIZE)
            values = []
            for _ in range(40):
                data = stream.read(CHUNK_SIZE, exception_on_overflow=False)
                values.append(audio_energy(data))
            stream.stop_stream()
            stream.close()
            avg_noise = sum(values) / len(values) if values else 0
            self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)
            log.info("Mic calibrated: noise=%.0f, threshold=%.0f", avg_noise, self.barge_in_threshold)
        except Exception as e:
            log.warning("Calibration failed: %s", e)
        finally:
            pa.terminate()

    # ─── G1 SPEAKER PLAYBACK ─────────────────────────────

    def _play_buffer_on_g1(self, pcm_24k: np.ndarray):
        """Play 24kHz audio on G1 speaker (resample to 16kHz, single call)."""
        if len(pcm_24k) < 100:
            return

        # Resample 24kHz → 16kHz
        tl = int(len(pcm_24k) * 16000 / 24000)
        audio_16k = np.interp(
            np.linspace(0, len(pcm_24k), tl, endpoint=False),
            np.arange(len(pcm_24k)),
            pcm_24k.astype(np.float64),
        ).astype(np.int16)

        from unitree_sdk2py.g1.audio.g1_audio_api import (
            ROBOT_API_ID_AUDIO_START_PLAY,
            ROBOT_API_ID_AUDIO_STOP_PLAY,
        )

        client = self._audio._client
        if not client:
            return

        app_name = "gemini"
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))
        time.sleep(0.1)

        pcm = audio_16k.tobytes()
        sid = f"s_{int(time.time() * 1000)}"
        param = json.dumps({
            "app_name": app_name,
            "stream_id": sid,
            "sample_rate": 16000,
            "channels": 1,
            "bits_per_sample": 16,
        })
        client._CallRequestWithParamAndBin(ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm))

        duration = len(audio_16k) / 16000
        time.sleep(duration + 0.3)
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))

    # ─── WEBSOCKET TASKS ─────────────────────────────────

    async def _capture_mic(self, ws):
        """Continuously capture mic via PyAudio and send to Gemini."""
        import pyaudio
        import ctypes

        ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int,
                                              ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)

        def _alsa_error_handler(filename, line, function, err, fmt):
            pass  # suppress

        c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler)
        try:
            asound = ctypes.cdll.LoadLibrary("libasound.so.2")
            asound.snd_lib_error_set_handler(c_error_handler)
        except Exception:
            pass  # libasound unavailable; skip ALSA log suppression

        pa = pyaudio.PyAudio()

        mic_info = pa.get_device_info_by_index(self._mic_device_idx)
        mic_rate = int(mic_info["defaultSampleRate"])
        mic_channels = 1

        # Open mic at native rate/channels
        stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
                         rate=mic_rate, input=True,
                         input_device_index=self._mic_device_idx,
                         frames_per_buffer=CHUNK_SIZE)

        log.info("Mic stream: device=%d, rate=%d, ch=%d", self._mic_device_idx, mic_rate, mic_channels)

        loud_chunks = 0
        loop = asyncio.get_running_loop()
        needs_resample = mic_rate != SEND_RATE or mic_channels != 1

        try:
            while self._running:
                data = await loop.run_in_executor(
                    None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False))

                # Convert to mono 16kHz if needed
                if needs_resample:
                    audio = np.frombuffer(data, dtype=np.int16)
                    # Stereo to mono
                    if mic_channels == 2:
                        audio = audio.reshape(-1, 2).mean(axis=1).astype(np.int16)
                    # Resample to 16kHz
                    if mic_rate != SEND_RATE:
                        tl = int(len(audio) * SEND_RATE / mic_rate)
                        if tl > 0:
                            audio = np.interp(
                                np.linspace(0, len(audio), tl, endpoint=False),
                                np.arange(len(audio)),
                                audio.astype(np.float64),
                            ).astype(np.int16)
                    data = audio.tobytes()

                energy = audio_energy(data)
                now = time.time()

                # Barge-in detection: require sustained loud input while the robot is speaking
                if self.speaking and now >= self._barge_in_block_until:
                    if (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE:
                        if energy > self.barge_in_threshold:
                            loud_chunks += 1
                        else:
                            loud_chunks = 0
                        if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                            log.info("Barge-in detected!")
                            self.interrupted = True
                            self.speaking = False
                            # Drop any queued playback audio
                            while not self._audio_queue.empty():
                                try:
                                    self._audio_queue.get_nowait()
                                except asyncio.QueueEmpty:
                                    break
                            loud_chunks = 0
                            self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN

                # Echo suppression: send silence while speaking
                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = SILENCE_PCM

                # Send to Gemini
                b64 = base64.b64encode(data_to_send).decode()
                msg = {
                    "realtime_input": {
                        "media_chunks": [
                            {"mime_type": f"audio/pcm;rate={SEND_RATE}", "data": b64}
                        ]
                    }
                }
                await ws.send(json.dumps(msg))

        except Exception as e:
            if self._running:
                log.error("Mic error: %s", e)
        finally:
            stream.stop_stream()
            stream.close()
            pa.terminate()

    async def _receive_audio(self, ws):
        """Receive audio responses and transcriptions from Gemini."""
        async for msg in ws:
            if not self._running:
                break
            try:
                response = json.loads(msg)
                server_content = response.get("serverContent", {})

                if server_content.get("interrupted"):
                    self.interrupted = False

                # User transcription (partial/streaming)
                input_tr = (
                    server_content.get("inputTranscription")
                    or server_content.get("input_transcription")
                    or server_content.get("inputAudioTranscription")
                    or server_content.get("input_audio_transcription")
                )
                if isinstance(input_tr, dict):
                    text = (input_tr.get("text") or "").strip()
                    now = time.time()
                    if text and now >= self._ignore_input_until and not self.speaking:
                        # Buffer ASR text
                        if now - self._asr_last_time > self.ASR_WINDOW_SEC:
                            self._asr_buf = ""
                        self._asr_buf = text  # Gemini sends cumulative transcription
                        self._asr_last_time = now

                if self.interrupted:
                    continue

                # Audio from Gemini
                model_turn = server_content.get("modelTurn")
                if model_turn:
                    for part in model_turn.get("parts", []):
                        inline_data = part.get("inlineData")
                        if inline_data:
                            audio_b64 = inline_data.get("data")
                            if audio_b64:
                                now = time.time()
                                if not self.speaking:
                                    self._ai_speaking_since = now
                                    # Gemini started responding — fire transcript callback
                                    if self._asr_buf and self._on_transcript:
                                        self._on_transcript(self._asr_buf, "user")
                                self.speaking = True
                                self._last_ai_audio_time = now
                                self._ignore_input_until = now + self.ECHO_GUARD_SEC
                                audio_bytes = base64.b64decode(audio_b64)
                                await self._audio_queue.put(audio_bytes)

                        # Text from Gemini (thinking/response text)
                        text_part = part.get("text", "").strip()
                        if text_part and self._on_transcript:
                            self._on_transcript(text_part, "marcus")

                # Turn complete — Gemini finished speaking
                turn_complete = server_content.get("turnComplete")
                if turn_complete:
                    # Clear ASR buffer after turn
                    self._asr_buf = ""

            except Exception as e:
                log.error("Receive error: %s", e)

    async def _play_audio(self):
        """Collect Gemini audio chunks and play on G1 speaker."""
        while self._running:
            try:
                if not self.speaking:
                    await asyncio.sleep(0.05)
                    continue

                # Pre-buffer
                buffered = False
                while self.speaking and not buffered:
                    if self._audio_queue.qsize() >= self.PREBUFFER_CHUNKS:
                        buffered = True
                    else:
                        await asyncio.sleep(0.01)

                # Collect all audio chunks
                buffer_chunks = []
                while self.speaking:
                    try:
                        data = await asyncio.wait_for(
                            self._audio_queue.get(), timeout=self.PLAYBACK_TIMEOUT)
                        audio = np.frombuffer(data, dtype=np.int16)
                        buffer_chunks.append(audio)
                        self._last_ai_audio_time = time.time()
                    except asyncio.TimeoutError:
                        if self._audio_queue.empty():
                            if time.time() - self._last_ai_audio_time > 0.3:
                                break

                # Play on G1 speaker
                if buffer_chunks:
                    full_audio = np.concatenate(buffer_chunks)
                    duration = len(full_audio) / RECEIVE_RATE
                    log.info("Playing %.1fs on G1", duration)

                    await asyncio.get_running_loop().run_in_executor(
                        None, self._play_buffer_on_g1, full_audio)

                self.speaking = False

            except Exception as e:
                log.error("Play error: %s", e)
                self.speaking = False

    # ─── MAIN LOOP ────────────────────────────────────────

    async def _run_async(self):
        import websockets
        import inspect

        system_prompt = load_system_prompt()

        # Unmute mic
        subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
        subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)

        # Calibrate
        self._calibrate_mic()

        # Newer websockets releases use additional_headers instead of extra_headers
        ws_kwargs = {"max_size": None}
        try:
            sig = inspect.signature(websockets.connect)
            if "extra_headers" in sig.parameters:
                ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
            else:
                ws_kwargs["additional_headers"] = {"Content-Type": "application/json"}
        except Exception:
            ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}

        while self._running:
            try:
                log.info("Connecting to Gemini...")
                async with websockets.connect(URI, **ws_kwargs) as ws:
                    setup_msg = {
                        "setup": {
                            "model": MODEL,
                            "generationConfig": {
                                "responseModalities": ["AUDIO"],
                                "thinkingConfig": {"thinkingBudget": 0},
                                "speechConfig": {
                                    "voiceConfig": {
                                        "prebuiltVoiceConfig": {"voiceName": VOICE_NAME}
                                    }
                                },
                            },
                            "realtimeInputConfig": {
                                "automaticActivityDetection": {
                                    "startOfSpeechSensitivity": "START_SENSITIVITY_HIGH",
                                    "prefixPaddingMs": 40,
                                    "endOfSpeechSensitivity": "END_SENSITIVITY_HIGH",
                                    "silenceDurationMs": 250,
                                }
                            },
                            "inputAudioTranscription": {},
                            "systemInstruction": {"parts": [{"text": system_prompt}]},
                        }
                    }
                    await ws.send(json.dumps(setup_msg))
                    await ws.recv()
                    log.info("Connected! Always listening...")

                    self._audio_queue = asyncio.Queue()

                    await asyncio.gather(
                        self._capture_mic(ws),
                        self._receive_audio(ws),
                        self._play_audio(),
                    )

            except Exception as e:
                if self._running:
                    log.error("Connection error: %s — reconnecting in 3s", e)
                await asyncio.sleep(3)

    def _voice_thread(self):
        asyncio.run(self._run_async())

    # ─── START / STOP ─────────────────────────────────────

    def start(self):
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(target=self._voice_thread, daemon=True, name="gemini_voice")
        self._thread.start()
        log.info("Gemini voice module started")

    def stop(self):
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Gemini voice module stopped")

    @property
    def is_running(self) -> bool:
        return self._running

    @property
    def state(self) -> str:
        return "LISTENING" if self._running else "STOPPED"

    @property
    def is_speaking(self) -> bool:
        return self.speaking


# ─── STANDALONE TEST ─────────────────────────────────────

if __name__ == "__main__":
    import sys
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def on_transcript(text, role):
        print(f" [{role.upper()}] {text}")

    audio = AudioAPI()
    voice = GeminiVoiceModule(audio, on_transcript=on_transcript)

    print("Gemini voice v2 — speak anytime. Ctrl+C to stop.\n")
    voice.start()

    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
        voice.stop()
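
# Standalone prerequisites (a sketch of the assumed environment, not verified here):
#   - a .env file (or exported variable) supplying GEMINI_API_KEY
#   - Config/config_Voice.json with a "mic" section whose "source_index" names the
#     PulseAudio source for the Hollyland mic (list sources with `pactl list short sources`)
#   - pyaudio, websockets, numpy, python-dotenv and unitree_sdk2py installed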