#!/usr/bin/env python3
"""
Voice/marcus_gemini_voice.py — Marcus Gemini Live Voice Module v2
==================================================================
Real-time bidirectional voice conversation using Gemini 2.5 Flash native audio.
Uses G1 built-in speaker + Hollyland wireless mic.

Based on SanadVoice/gemini_interact architecture:
- PyAudio for mic (not parec)
- Echo suppression (silence when speaking)
- Gemini VAD (automatic activity detection)
- thinkingBudget=0 (no thinking text)
- ASR buffering for full sentences
- Vision routed to brain's Qwen camera

Usage:
    from Voice.marcus_gemini_voice import GeminiVoiceModule
    voice = GeminiVoiceModule(audio_api, on_transcript=callback)
    voice.start()
"""

import array
import asyncio
import base64
import json
import logging
import os
import subprocess
import threading
import time

import numpy as np
from dotenv import load_dotenv

load_dotenv()

# Project layout: <PROJECT_BASE>/Marcus/{Config,logs,...}
BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("gemini_voice")


def load_config(name: str) -> dict:
    """Load and parse Config/config_<name>.json from the project root.

    Raises FileNotFoundError / json.JSONDecodeError on a missing or
    malformed config — callers treat a broken config as fatal.
    """
    path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
    with open(path, "r") as f:
        return json.load(f)


# ─── CONFIGURATION ────────────────────────────────────────
# SECURITY(review): the Gemini API key was hard-coded in this file. It is now
# read from the environment (.env is loaded above); the old literal remains
# only as a backward-compatible fallback. Rotate that key and delete the
# fallback — a key committed to source control must be considered leaked.
API_KEY = os.environ.get(
    "GEMINI_API_KEY", "AIzaSyDt9Xi83MDZuuPpfwfHyMD92X7ZKdGkqf8"
)
MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
URI = (
    "wss://generativelanguage.googleapis.com/ws/"
    "google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent"
    f"?key={API_KEY}"
)
VOICE_NAME = "Charon"
SEND_RATE = 16000       # Hz — PCM rate Gemini expects for uplink audio
RECEIVE_RATE = 24000    # Hz — PCM rate of Gemini's audio responses
CHUNK_SIZE = 512        # frames per mic read
CHANNELS = 1


def load_system_prompt():
    """Return the system prompt text.

    Prefers Config/marcus_voice_prompt.txt (UTF-8 with optional BOM);
    falls back to a built-in default when the file is absent.
    """
    paths = [
        os.path.join(PROJECT_ROOT, "Config", "marcus_voice_prompt.txt"),
    ]
    for p in paths:
        if os.path.exists(p):
            with open(p, "r", encoding="utf-8-sig") as f:
                return f.read().strip()
    return (
        "You are Marcus, a humanoid robot by YS Lootah Technology, Dubai. "
        "Speak Arabic (UAE dialect) and English. Be concise — 1-2 sentences max."
    )


# ─── AUDIO HELPERS ────────────────────────────────────────
def audio_energy(pcm: bytes) -> int:
    """Return mean absolute amplitude of 16-bit little-endian PCM.

    Used as a cheap loudness metric for barge-in detection and echo
    gating. Returns 0 on empty/malformed input rather than raising.
    """
    try:
        samples = array.array("h", pcm)
        if not samples:
            return 0
        return sum(abs(s) for s in samples) // len(samples)
    except Exception:
        return 0


# One chunk of pure silence (16-bit mono), sent to Gemini instead of live
# mic audio while the robot itself is speaking (echo suppression).
SILENCE_PCM = b'\x00' * (CHUNK_SIZE * 2)


def _silence_alsa_errors():
    """Install a no-op ALSA error handler so PyAudio doesn't spam stderr.

    Returns the ctypes callback object; the caller MUST keep a reference to
    it for as long as the handler may fire, otherwise libasound would call
    into freed memory. Best-effort: silently does nothing if libasound.so.2
    is unavailable.
    """
    import ctypes

    handler_type = ctypes.CFUNCTYPE(
        None, ctypes.c_char_p, ctypes.c_int,
        ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)

    def _noop(filename, line, function, err, fmt):
        pass  # suppress

    callback = handler_type(_noop)
    try:
        asound = ctypes.cdll.LoadLibrary("libasound.so.2")
        asound.snd_lib_error_set_handler(callback)
    except Exception:
        pass  # no ALSA here — nothing to silence
    return callback


# ─── GEMINI VOICE MODULE ─────────────────────────────────
class GeminiVoiceModule:
    """Real-time voice conversation via Gemini, using G1 speaker + Hollyland mic."""

    def __init__(self, audio_api, on_transcript=None):
        """
        Args:
            audio_api: project AudioAPI wrapper around the Unitree G1 audio
                client (provides `_client` and optionally `_mic_source`).
            on_transcript: optional callable(text, role) invoked with
                role "user" (ASR of the human) or "marcus" (model text).
        """
        self._audio = audio_api
        self._on_transcript = on_transcript
        self._config = load_config("Voice")
        self._mic_source = getattr(
            audio_api, '_mic_source',
            self._config["mic"].get("source_index", "0"))

        # State
        self.speaking = False       # True while G1 is playing Gemini audio
        self.interrupted = False    # True after a barge-in until Gemini acks
        self._running = False
        self._thread = None
        self._audio_queue = None    # Created in async context

        # Tuning
        self.MIN_THRESHOLD = 3000             # floor for barge-in energy
        self.barge_in_threshold = self.MIN_THRESHOLD
        self.REQUIRED_LOUD_CHUNKS = 10        # consecutive loud chunks to barge in
        self.PREBUFFER_CHUNKS = 2             # chunks buffered before playback
        self.PLAYBACK_TIMEOUT = 0.25          # s — queue wait per chunk
        self.BARGE_IN_COOLDOWN = 0.7          # s — ignore barge-in after one fires
        self.AI_SPEAK_GRACE = 0.20            # s — no barge-in right after AI starts
        self.ECHO_GUARD_SEC = 0.8             # s — drop ASR right after AI audio
        self.SPEAKING_ENERGY_GATE = 0.85      # fraction of threshold for echo gate
        self.SEND_SILENCE_WHEN_SPEAKING = True

        # Timing
        self._ai_speaking_since = 0.0
        self._last_ai_audio_time = 0.0
        self._barge_in_block_until = 0.0
        self._ignore_input_until = 0.0

        # ASR buffer (Gemini streams cumulative transcriptions)
        self._asr_buf = ""
        self._asr_last_time = 0.0
        self.ASR_WINDOW_SEC = 2.0

        # Keep the ALSA handler callback alive for the object's lifetime.
        self._alsa_cb = _silence_alsa_errors()

        # Find Hollyland mic PyAudio device index
        self._mic_device_idx = self._find_mic_device()

        log.info("GeminiVoiceModule v2 initialized")

    # ─── MIC DEVICE DETECTION ─────────────────────────────
    def _find_mic_device(self) -> int:
        """Find Hollyland wireless mic in PyAudio devices. Returns device index.

        Falls back to the 'default'/'pulse' device, then device 0. Also
        unmutes and normalizes the PulseAudio source volume as a side effect.
        """
        import pyaudio

        self._alsa_cb = _silence_alsa_errors()
        pa = pyaudio.PyAudio()
        try:
            # First: set PulseAudio default source to Hollyland
            subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"],
                           capture_output=True)
            subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"],
                           capture_output=True)

            # Search for wireless mic by name
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                name = info.get("name", "").lower()
                if info["maxInputChannels"] > 0 and (
                        "wireless" in name or "hollyland" in name):
                    log.info("Mic found: [%d] %s (%dHz)",
                             i, info["name"], int(info["defaultSampleRate"]))
                    return i

            # Fallback to 'default' or 'pulse' device
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                if info["maxInputChannels"] > 0 and info.get("name", "") in (
                        "default", "pulse"):
                    log.info("Mic fallback: [%d] %s", i, info["name"])
                    return i

            log.warning("No mic found, using device 0")
            return 0
        finally:
            pa.terminate()

    # ─── MIC CALIBRATION ──────────────────────────────────
    def _calibrate_mic(self):
        """Calibrate barge-in threshold from ambient noise.

        Samples ~40 chunks of room noise and sets the threshold to
        3x the average energy, floored at MIN_THRESHOLD. Best-effort:
        a failure leaves the existing threshold in place.
        """
        import pyaudio

        self._alsa_cb = _silence_alsa_errors()
        pa = pyaudio.PyAudio()
        mic_info = pa.get_device_info_by_index(self._mic_device_idx)
        mic_rate = int(mic_info["defaultSampleRate"])
        mic_channels = 1
        try:
            stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
                             rate=mic_rate, input=True,
                             input_device_index=self._mic_device_idx,
                             frames_per_buffer=CHUNK_SIZE)
            values = []
            for _ in range(40):
                data = stream.read(CHUNK_SIZE, exception_on_overflow=False)
                values.append(audio_energy(data))
            stream.stop_stream()
            stream.close()
            avg_noise = sum(values) / len(values) if values else 0
            self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)
            log.info("Mic calibrated: noise=%.0f, threshold=%.0f",
                     avg_noise, self.barge_in_threshold)
        except Exception as e:
            log.warning("Calibration failed: %s", e)
        finally:
            pa.terminate()

    # ─── G1 SPEAKER PLAYBACK ─────────────────────────────
    def _play_buffer_on_g1(self, pcm_24k: np.ndarray):
        """Play 24kHz audio on G1 speaker (resample to 16kHz, single call).

        Blocks for the duration of playback; runs in an executor thread.
        """
        if len(pcm_24k) < 100:
            return  # too short to be audible

        # Resample 24kHz → 16kHz via linear interpolation
        tl = int(len(pcm_24k) * 16000 / 24000)
        audio_16k = np.interp(
            np.linspace(0, len(pcm_24k), tl, endpoint=False),
            np.arange(len(pcm_24k)),
            pcm_24k.astype(np.float64),
        ).astype(np.int16)

        from unitree_sdk2py.g1.audio.g1_audio_api import (
            ROBOT_API_ID_AUDIO_START_PLAY,
            ROBOT_API_ID_AUDIO_STOP_PLAY,
        )

        client = self._audio._client
        if not client:
            return
        app_name = "gemini"
        # Stop any previous stream before starting a new one
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY,
                     json.dumps({"app_name": app_name}))
        time.sleep(0.1)

        pcm = audio_16k.tobytes()
        sid = f"s_{int(time.time() * 1000)}"
        param = json.dumps({
            "app_name": app_name,
            "stream_id": sid,
            "sample_rate": 16000,
            "channels": 1,
            "bits_per_sample": 16,
        })
        client._CallRequestWithParamAndBin(
            ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm))

        # Block until playback should be finished, then close the stream.
        duration = len(audio_16k) / 16000
        time.sleep(duration + 0.3)
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY,
                     json.dumps({"app_name": app_name}))

    # ─── WEBSOCKET TASKS ─────────────────────────────────
    async def _capture_mic(self, ws):
        """Continuously capture mic via PyAudio and send to Gemini.

        Handles resampling to 16kHz mono, barge-in detection while the
        robot is speaking, and echo suppression (silence substitution).
        """
        import pyaudio

        self._alsa_cb = _silence_alsa_errors()
        pa = pyaudio.PyAudio()
        mic_info = pa.get_device_info_by_index(self._mic_device_idx)
        mic_rate = int(mic_info["defaultSampleRate"])
        mic_channels = 1

        # Open mic at native rate/channels
        stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
                         rate=mic_rate, input=True,
                         input_device_index=self._mic_device_idx,
                         frames_per_buffer=CHUNK_SIZE)
        log.info("Mic stream: device=%d, rate=%d, ch=%d",
                 self._mic_device_idx, mic_rate, mic_channels)

        loud_chunks = 0
        loop = asyncio.get_running_loop()
        needs_resample = mic_rate != SEND_RATE or mic_channels != 1

        try:
            while self._running:
                # Blocking read off the event loop thread
                data = await loop.run_in_executor(
                    None,
                    lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False))

                # Convert to mono 16kHz if needed
                if needs_resample:
                    audio = np.frombuffer(data, dtype=np.int16)
                    # Stereo to mono (defensive: mic_channels is fixed to 1 above)
                    if mic_channels == 2:
                        audio = audio.reshape(-1, 2).mean(axis=1).astype(np.int16)
                    # Resample to 16kHz
                    if mic_rate != SEND_RATE:
                        tl = int(len(audio) * SEND_RATE / mic_rate)
                        if tl > 0:
                            audio = np.interp(
                                np.linspace(0, len(audio), tl, endpoint=False),
                                np.arange(len(audio)),
                                audio.astype(np.float64),
                            ).astype(np.int16)
                    data = audio.tobytes()

                energy = audio_energy(data)
                now = time.time()

                # Barge-in detection: user talking over the robot
                if self.speaking and now >= self._barge_in_block_until:
                    if (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE:
                        if energy > self.barge_in_threshold:
                            loud_chunks += 1
                        else:
                            loud_chunks = 0
                        if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                            log.info("Barge-in detected!")
                            self.interrupted = True
                            self.speaking = False
                            # Drop any queued-but-unplayed robot audio
                            while not self._audio_queue.empty():
                                try:
                                    self._audio_queue.get_nowait()
                                except asyncio.QueueEmpty:
                                    break
                            loud_chunks = 0
                            self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN

                # Echo suppression: send silence while speaking, unless the
                # input is loud enough to plausibly be the user (barge-in).
                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = SILENCE_PCM

                # Send to Gemini
                b64 = base64.b64encode(data_to_send).decode()
                msg = {
                    "realtime_input": {
                        "media_chunks": [
                            {"mime_type": f"audio/pcm;rate={SEND_RATE}",
                             "data": b64}
                        ]
                    }
                }
                await ws.send(json.dumps(msg))
        except Exception as e:
            if self._running:
                log.error("Mic error: %s", e)
        finally:
            stream.stop_stream()
            stream.close()
            pa.terminate()

    async def _receive_audio(self, ws):
        """Receive audio responses and transcriptions from Gemini."""
        async for msg in ws:
            if not self._running:
                break
            try:
                response = json.loads(msg)
                server_content = response.get("serverContent", {})

                # Gemini acknowledged the interruption — resume normal flow.
                if server_content.get("interrupted"):
                    self.interrupted = False

                # User transcription (partial/streaming). Field name varies
                # across API revisions, hence the alternatives.
                input_tr = (
                    server_content.get("inputTranscription")
                    or server_content.get("input_transcription")
                    or server_content.get("inputAudioTranscription")
                    or server_content.get("input_audio_transcription")
                )
                if isinstance(input_tr, dict):
                    text = (input_tr.get("text") or "").strip()
                    now = time.time()
                    if text and now >= self._ignore_input_until and not self.speaking:
                        # Buffer ASR text; reset after a silence window.
                        if now - self._asr_last_time > self.ASR_WINDOW_SEC:
                            self._asr_buf = ""
                        self._asr_buf = text  # Gemini sends cumulative transcription
                        self._asr_last_time = now

                if self.interrupted:
                    continue  # drop audio for the turn that was barged in on

                # Audio from Gemini
                model_turn = server_content.get("modelTurn")
                if model_turn:
                    for part in model_turn.get("parts", []):
                        inline_data = part.get("inlineData")
                        if inline_data:
                            audio_b64 = inline_data.get("data")
                            if audio_b64:
                                now = time.time()
                                if not self.speaking:
                                    self._ai_speaking_since = now
                                    # Gemini started responding — fire
                                    # transcript callback with the user's ASR
                                    if self._asr_buf and self._on_transcript:
                                        self._on_transcript(self._asr_buf, "user")
                                self.speaking = True
                                self._last_ai_audio_time = now
                                self._ignore_input_until = now + self.ECHO_GUARD_SEC
                                audio_bytes = base64.b64decode(audio_b64)
                                await self._audio_queue.put(audio_bytes)

                        # Text from Gemini (thinking/response text)
                        text_part = part.get("text", "").strip()
                        if text_part and self._on_transcript:
                            self._on_transcript(text_part, "marcus")

                # Turn complete — Gemini finished speaking
                turn_complete = server_content.get("turnComplete")
                if turn_complete:
                    # Clear ASR buffer after turn
                    self._asr_buf = ""
            except Exception as e:
                log.error("Receive error: %s", e)

    async def _play_audio(self):
        """Collect Gemini audio chunks and play on G1 speaker."""
        while self._running:
            try:
                if not self.speaking:
                    await asyncio.sleep(0.05)
                    continue

                # Pre-buffer a couple of chunks to avoid stutter on start
                buffered = False
                while self.speaking and not buffered:
                    if self._audio_queue.qsize() >= self.PREBUFFER_CHUNKS:
                        buffered = True
                    else:
                        await asyncio.sleep(0.01)

                # Collect all audio chunks until the stream goes quiet
                buffer_chunks = []
                while self.speaking:
                    try:
                        data = await asyncio.wait_for(
                            self._audio_queue.get(),
                            timeout=self.PLAYBACK_TIMEOUT)
                        audio = np.frombuffer(data, dtype=np.int16)
                        buffer_chunks.append(audio)
                        self._last_ai_audio_time = time.time()
                    except asyncio.TimeoutError:
                        # Stop collecting once no new audio arrived recently
                        if self._audio_queue.empty():
                            if time.time() - self._last_ai_audio_time > 0.3:
                                break

                # Play on G1 speaker (blocking call in executor)
                if buffer_chunks:
                    full_audio = np.concatenate(buffer_chunks)
                    duration = len(full_audio) / RECEIVE_RATE
                    log.info("Playing %.1fs on G1", duration)
                    await asyncio.get_running_loop().run_in_executor(
                        None, self._play_buffer_on_g1, full_audio)

                self.speaking = False
            except Exception as e:
                log.error("Play error: %s", e)
                self.speaking = False

    # ─── MAIN LOOP ────────────────────────────────────────
    async def _run_async(self):
        """Connect to Gemini Live and run capture/receive/play tasks.

        Reconnects with a 3s backoff until stop() clears `_running`.
        """
        import websockets
        import inspect

        system_prompt = load_system_prompt()

        # Unmute mic
        subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"],
                       capture_output=True)
        subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"],
                       capture_output=True)

        # Calibrate
        self._calibrate_mic()

        # websockets renamed extra_headers → additional_headers; detect which
        # keyword this installed version accepts.
        ws_kwargs = {"max_size": None}
        try:
            sig = inspect.signature(websockets.connect)
            if "extra_headers" in sig.parameters:
                ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
            else:
                ws_kwargs["additional_headers"] = {"Content-Type": "application/json"}
        except Exception:
            ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}

        while self._running:
            try:
                log.info("Connecting to Gemini...")
                async with websockets.connect(URI, **ws_kwargs) as ws:
                    setup_msg = {
                        "setup": {
                            "model": MODEL,
                            "generationConfig": {
                                "responseModalities": ["AUDIO"],
                                "thinkingConfig": {"thinkingBudget": 0},
                                "speechConfig": {
                                    "voiceConfig": {
                                        "prebuiltVoiceConfig": {"voiceName": VOICE_NAME}
                                    }
                                },
                            },
                            "realtimeInputConfig": {
                                "automaticActivityDetection": {
                                    "startOfSpeechSensitivity": "START_SENSITIVITY_HIGH",
                                    "prefixPaddingMs": 40,
                                    "endOfSpeechSensitivity": "END_SENSITIVITY_HIGH",
                                    "silenceDurationMs": 250,
                                }
                            },
                            "inputAudioTranscription": {},
                            "systemInstruction": {"parts": [{"text": system_prompt}]},
                        }
                    }
                    await ws.send(json.dumps(setup_msg))
                    await ws.recv()  # consume setupComplete ack
                    log.info("Connected! Always listening...")

                    self._audio_queue = asyncio.Queue()
                    await asyncio.gather(
                        self._capture_mic(ws),
                        self._receive_audio(ws),
                        self._play_audio(),
                    )
            except Exception as e:
                if self._running:
                    log.error("Connection error: %s — reconnecting in 3s", e)
                    await asyncio.sleep(3)

    def _voice_thread(self):
        """Thread target: own event loop for the voice session."""
        asyncio.run(self._run_async())

    # ─── START / STOP ─────────────────────────────────────
    def start(self):
        """Start the voice session in a daemon thread (idempotent)."""
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(target=self._voice_thread,
                                        daemon=True, name="gemini_voice")
        self._thread.start()
        log.info("Gemini voice module started")

    def stop(self):
        """Signal shutdown and join the voice thread (5s timeout)."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Gemini voice module stopped")

    @property
    def is_running(self) -> bool:
        return self._running

    @property
    def state(self) -> str:
        return "LISTENING" if self._running else "STOPPED"

    @property
    def is_speaking(self) -> bool:
        return self.speaking


# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    import sys
    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def on_transcript(text, role):
        print(f"  [{role.upper()}] {text}")

    audio = AudioAPI()
    voice = GeminiVoiceModule(audio, on_transcript=on_transcript)
    print("Gemini voice v2 — speak anytime. Ctrl+C to stop.\n")
    voice.start()
    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
        voice.stop()