Marcus/Voice/marcus_gemini_voice.py
2026-04-12 18:50:22 +04:00

609 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Voice/marcus_gemini_voice.py — Marcus Gemini Live Voice Module v2
==================================================================
Real-time bidirectional voice conversation using Gemini 2.5 Flash native audio.
Uses G1 built-in speaker + Hollyland wireless mic.
Based on SanadVoice/gemini_interact architecture:
- PyAudio for mic (not parec)
- Echo suppression (silence when speaking)
- Gemini VAD (automatic activity detection)
- thinkingBudget=0 (no thinking text)
- ASR buffering for full sentences
- Vision routed to brain's Qwen camera
Usage:
from Voice.marcus_gemini_voice import GeminiVoiceModule
voice = GeminiVoiceModule(audio_api, on_transcript=callback)
voice.start()
"""
import array
import asyncio
import base64
import json
import logging
import os
import subprocess
import threading
import time
import numpy as np
from dotenv import load_dotenv
load_dotenv()
# Project layout: everything lives under $PROJECT_BASE/Marcus (default /home/unitree).
BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)
# Log to both logs/voice.log and the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("gemini_voice")
def load_config(name: str) -> dict:
    """Load and parse Config/config_<name>.json from the project root.

    Args:
        name: Config suffix, e.g. "Voice" -> Config/config_Voice.json.

    Returns:
        The parsed JSON object as a dict.

    Raises:
        FileNotFoundError: if the config file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
    # Explicit UTF-8: the platform-default encoding is not guaranteed to
    # decode JSON config files correctly on every deployment target.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
# ─── CONFIGURATION ────────────────────────────────────────
# SECURITY: the Gemini API key must come from the environment (.env is
# loaded above via load_dotenv()) — never hard-code credentials in source.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
# Gemini Live API bidirectional websocket endpoint (v1alpha BidiGenerateContent).
URI = (
    "wss://generativelanguage.googleapis.com/ws/"
    "google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent"
    f"?key={API_KEY}"
)
VOICE_NAME = "Charon"    # Gemini prebuilt voice
SEND_RATE = 16000        # PCM sample rate sent to Gemini (Hz)
RECEIVE_RATE = 24000     # PCM sample rate received from Gemini (Hz)
CHUNK_SIZE = 512         # mic frames per buffer
CHANNELS = 1             # mono audio throughout
def load_system_prompt():
    """Return the Marcus voice system prompt.

    Uses Config/marcus_voice_prompt.txt when present (utf-8-sig tolerates a
    BOM from Windows editors); otherwise falls back to a built-in bilingual
    default.
    """
    candidates = [
        os.path.join(PROJECT_ROOT, "Config", "marcus_voice_prompt.txt"),
    ]
    for candidate in candidates:
        if not os.path.exists(candidate):
            continue
        with open(candidate, "r", encoding="utf-8-sig") as fh:
            return fh.read().strip()
    return (
        "You are Marcus, a humanoid robot by YS Lootah Technology, Dubai. "
        "Speak Arabic (UAE dialect) and English. Be concise — 1-2 sentences max."
    )
# ─── AUDIO HELPERS ────────────────────────────────────────
def audio_energy(pcm: bytes) -> int:
    """Average absolute sample value of 16-bit little-endian mono PCM.

    Returns 0 for empty or malformed (e.g. odd-length) input instead of
    raising, so callers can treat any bad chunk as silence.
    """
    try:
        samples = array.array("h", pcm)
    except Exception:
        return 0
    count = len(samples)
    if count == 0:
        return 0
    return sum(map(abs, samples)) // count
# One mic chunk of pure silence (CHUNK_SIZE 16-bit samples) — substituted for
# real mic audio while Marcus is speaking, to keep Gemini's VAD from hearing
# the robot's own speaker echo.
SILENCE_PCM = b'\x00' * (CHUNK_SIZE * 2)
# ─── GEMINI VOICE MODULE ─────────────────────────────────
class GeminiVoiceModule:
    """Real-time voice conversation via Gemini, using G1 speaker + Hollyland mic."""

    def __init__(self, audio_api, on_transcript=None):
        # audio_api: Unitree G1 AudioAPI wrapper (its ._client is used for playback).
        # on_transcript: optional callback(text, role) — role is "user" or "marcus".
        self._audio = audio_api
        self._on_transcript = on_transcript
        self._config = load_config("Voice")
        # PulseAudio source for the mic: prefer the AudioAPI's own value if it
        # exposes one, else fall back to the Voice config's source_index.
        self._mic_source = getattr(audio_api, '_mic_source',
            self._config["mic"].get("source_index", "0"))
        # State
        self.speaking = False      # True while Gemini audio is queued/playing
        self.interrupted = False   # set on barge-in; drops audio until Gemini acks
        self._running = False
        self._thread = None
        self._audio_queue = None  # Created in async context
        # Tuning
        self.MIN_THRESHOLD = 3000              # hard floor for the barge-in energy threshold
        self.barge_in_threshold = self.MIN_THRESHOLD  # recalibrated from ambient noise
        self.REQUIRED_LOUD_CHUNKS = 10         # consecutive loud chunks needed to barge in
        self.PREBUFFER_CHUNKS = 2              # chunks buffered before playback starts
        self.PLAYBACK_TIMEOUT = 0.25           # queue-get timeout while collecting audio (s)
        self.BARGE_IN_COOLDOWN = 0.7           # ignore further barge-ins for this long (s)
        self.AI_SPEAK_GRACE = 0.20             # no barge-in right after AI starts speaking (s)
        self.ECHO_GUARD_SEC = 0.8              # ignore input transcription after AI audio (s)
        self.SPEAKING_ENERGY_GATE = 0.85       # below this fraction of threshold, mic -> silence
        self.SEND_SILENCE_WHEN_SPEAKING = True # echo suppression enabled by default
        # Timing
        self._ai_speaking_since = 0.0
        self._last_ai_audio_time = 0.0
        self._barge_in_block_until = 0.0
        self._ignore_input_until = 0.0
        # ASR buffer
        self._asr_buf = ""
        self._asr_last_time = 0.0
        self.ASR_WINDOW_SEC = 2.0  # gap after which a new utterance starts fresh
        # Find Hollyland mic PyAudio device index
        self._mic_device_idx = self._find_mic_device()
        log.info("GeminiVoiceModule v2 initialized")

    # ─── MIC DEVICE DETECTION ─────────────────────────────
    def _find_mic_device(self) -> int:
        """Find Hollyland wireless mic in PyAudio devices. Returns device index."""
        import pyaudio
        import ctypes
        # Install a no-op ALSA error handler so device enumeration doesn't spam stderr.
        ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)
        def _alsa_error_handler(filename, line, function, err, fmt):
            pass  # suppress
        c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler)
        try:
            asound = ctypes.cdll.LoadLibrary("libasound.so.2")
            asound.snd_lib_error_set_handler(c_error_handler)
        except: pass  # ALSA_suppress
        pa = pyaudio.PyAudio()
        try:
            # First: set PulseAudio default source to Hollyland
            subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
            subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)
            # Search for wireless mic by name
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                name = info.get("name", "").lower()
                if info["maxInputChannels"] > 0 and ("wireless" in name or "hollyland" in name):
                    log.info("Mic found: [%d] %s (%dHz)", i, info["name"], int(info["defaultSampleRate"]))
                    return i
            # Fallback to 'default' or 'pulse' device
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                if info["maxInputChannels"] > 0 and info.get("name", "") in ("default", "pulse"):
                    log.info("Mic fallback: [%d] %s", i, info["name"])
                    return i
            log.warning("No mic found, using device 0")
            return 0
        finally:
            pa.terminate()

    # ─── MIC CALIBRATION ──────────────────────────────────
    def _calibrate_mic(self):
        """Calibrate barge-in threshold from ambient noise."""
        import pyaudio
        import ctypes
        # Same ALSA stderr suppression as _find_mic_device.
        ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)
        def _alsa_error_handler(filename, line, function, err, fmt):
            pass  # suppress
        c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler)
        try:
            asound = ctypes.cdll.LoadLibrary("libasound.so.2")
            asound.snd_lib_error_set_handler(c_error_handler)
        except: pass  # ALSA_suppress
        pa = pyaudio.PyAudio()
        mic_info = pa.get_device_info_by_index(self._mic_device_idx)
        mic_rate = int(mic_info["defaultSampleRate"])
        mic_channels = 1
        try:
            stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
                rate=mic_rate, input=True,
                input_device_index=self._mic_device_idx,
                frames_per_buffer=CHUNK_SIZE)
            # Sample 40 chunks of ambient noise.
            values = []
            for _ in range(40):
                data = stream.read(CHUNK_SIZE, exception_on_overflow=False)
                values.append(audio_energy(data))
            stream.stop_stream()
            stream.close()
            avg_noise = sum(values) / len(values) if values else 0
            # Threshold = 3x ambient noise, but never below the hard floor.
            self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)
            log.info("Mic calibrated: noise=%.0f, threshold=%.0f", avg_noise, self.barge_in_threshold)
        except Exception as e:
            # Best-effort: keep the default threshold if the mic can't be read.
            log.warning("Calibration failed: %s", e)
        finally:
            pa.terminate()

    # ─── G1 SPEAKER PLAYBACK ─────────────────────────────
    def _play_buffer_on_g1(self, pcm_24k: np.ndarray):
        """Play 24kHz audio on G1 speaker (resample to 16kHz, single call)."""
        if len(pcm_24k) < 100:
            return
        # Resample 24kHz → 16kHz
        tl = int(len(pcm_24k) * 16000 / 24000)
        audio_16k = np.interp(
            np.linspace(0, len(pcm_24k), tl, endpoint=False),
            np.arange(len(pcm_24k)),
            pcm_24k.astype(np.float64),
        ).astype(np.int16)
        from unitree_sdk2py.g1.audio.g1_audio_api import (
            ROBOT_API_ID_AUDIO_START_PLAY,
            ROBOT_API_ID_AUDIO_STOP_PLAY,
        )
        client = self._audio._client
        if not client:
            return
        app_name = "gemini"
        # Stop any in-progress playback before starting a new stream.
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))
        time.sleep(0.1)
        pcm = audio_16k.tobytes()
        sid = f"s_{int(time.time() * 1000)}"
        param = json.dumps({
            "app_name": app_name,
            "stream_id": sid,
            "sample_rate": 16000,
            "channels": 1,
            "bits_per_sample": 16,
        })
        client._CallRequestWithParamAndBin(ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm))
        # Block for the clip duration (+ margin) so playback isn't cut off early.
        duration = len(audio_16k) / 16000
        time.sleep(duration + 0.3)
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))

    # ─── WEBSOCKET TASKS ─────────────────────────────────
    async def _capture_mic(self, ws):
        """Continuously capture mic via PyAudio and send to Gemini."""
        import pyaudio
        import ctypes
        # Same ALSA stderr suppression as _find_mic_device.
        ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)
        def _alsa_error_handler(filename, line, function, err, fmt):
            pass  # suppress
        c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler)
        try:
            asound = ctypes.cdll.LoadLibrary("libasound.so.2")
            asound.snd_lib_error_set_handler(c_error_handler)
        except: pass  # ALSA_suppress
        pa = pyaudio.PyAudio()
        mic_info = pa.get_device_info_by_index(self._mic_device_idx)
        mic_rate = int(mic_info["defaultSampleRate"])
        mic_channels = 1
        # Open mic at native rate/channels
        stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
            rate=mic_rate, input=True,
            input_device_index=self._mic_device_idx,
            frames_per_buffer=CHUNK_SIZE)
        log.info("Mic stream: device=%d, rate=%d, ch=%d", self._mic_device_idx, mic_rate, mic_channels)
        loud_chunks = 0
        loop = asyncio.get_event_loop()
        needs_resample = mic_rate != SEND_RATE or mic_channels != 1
        try:
            while self._running:
                # Blocking read runs in the executor so it doesn't stall the event loop.
                data = await loop.run_in_executor(
                    None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False))
                # Convert to mono 16kHz if needed
                if needs_resample:
                    audio = np.frombuffer(data, dtype=np.int16)
                    # Stereo to mono
                    if mic_channels == 2:
                        audio = audio.reshape(-1, 2).mean(axis=1).astype(np.int16)
                    # Resample to 16kHz
                    if mic_rate != SEND_RATE:
                        tl = int(len(audio) * SEND_RATE / mic_rate)
                        if tl > 0:
                            audio = np.interp(
                                np.linspace(0, len(audio), tl, endpoint=False),
                                np.arange(len(audio)),
                                audio.astype(np.float64),
                            ).astype(np.int16)
                    data = audio.tobytes()
                energy = audio_energy(data)
                now = time.time()
                # Barge-in detection: only while AI is speaking, past the cooldown
                # and the initial grace period after the AI started talking.
                if self.speaking and now >= self._barge_in_block_until:
                    if (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE:
                        if energy > self.barge_in_threshold:
                            loud_chunks += 1
                        else:
                            loud_chunks = 0
                        if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                            log.info("Barge-in detected!")
                            self.interrupted = True
                            self.speaking = False
                            # Drop any queued (now stale) Gemini audio.
                            while not self._audio_queue.empty():
                                try: self._audio_queue.get_nowait()
                                except: break
                            loud_chunks = 0
                            self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN
                # Echo suppression: send silence while speaking
                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    # Quiet chunks (below the gate) are almost certainly speaker
                    # echo, so substitute silence; loud ones may be a barge-in.
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = SILENCE_PCM
                # Send to Gemini
                b64 = base64.b64encode(data_to_send).decode()
                msg = {
                    "realtime_input": {
                        "media_chunks": [
                            {"mime_type": f"audio/pcm;rate={SEND_RATE}", "data": b64}
                        ]
                    }
                }
                await ws.send(json.dumps(msg))
        except Exception as e:
            if self._running:
                log.error("Mic error: %s", e)
        finally:
            stream.stop_stream()
            stream.close()
            pa.terminate()

    async def _receive_audio(self, ws):
        """Receive audio responses and transcriptions from Gemini."""
        async for msg in ws:
            if not self._running:
                break
            try:
                response = json.loads(msg)
                server_content = response.get("serverContent", {})
                # Gemini acknowledged our interruption — accept new audio again.
                if server_content.get("interrupted"):
                    self.interrupted = False
                # User transcription (partial/streaming). Several key spellings are
                # probed since the API has used different casings — TODO confirm
                # which one the current API version actually sends.
                input_tr = (
                    server_content.get("inputTranscription")
                    or server_content.get("input_transcription")
                    or server_content.get("inputAudioTranscription")
                    or server_content.get("input_audio_transcription")
                )
                if isinstance(input_tr, dict):
                    text = (input_tr.get("text") or "").strip()
                    now = time.time()
                    # Skip transcription inside the echo-guard window or while the
                    # robot itself is speaking (likely self-transcription).
                    if text and now >= self._ignore_input_until and not self.speaking:
                        # Buffer ASR text
                        if now - self._asr_last_time > self.ASR_WINDOW_SEC:
                            self._asr_buf = ""
                        self._asr_buf = text  # Gemini sends cumulative transcription
                        self._asr_last_time = now
                if self.interrupted:
                    continue
                # Audio from Gemini
                model_turn = server_content.get("modelTurn")
                if model_turn:
                    for part in model_turn.get("parts", []):
                        inline_data = part.get("inlineData")
                        if inline_data:
                            audio_b64 = inline_data.get("data")
                            if audio_b64:
                                now = time.time()
                                if not self.speaking:
                                    self._ai_speaking_since = now
                                    # Gemini started responding — fire transcript callback
                                    if self._asr_buf and self._on_transcript:
                                        self._on_transcript(self._asr_buf, "user")
                                self.speaking = True
                                self._last_ai_audio_time = now
                                self._ignore_input_until = now + self.ECHO_GUARD_SEC
                                audio_bytes = base64.b64decode(audio_b64)
                                await self._audio_queue.put(audio_bytes)
                        # Text from Gemini (thinking/response text)
                        text_part = part.get("text", "").strip()
                        if text_part and self._on_transcript:
                            self._on_transcript(text_part, "marcus")
                # Turn complete — Gemini finished speaking
                turn_complete = server_content.get("turnComplete")
                if turn_complete:
                    # Clear ASR buffer after turn
                    self._asr_buf = ""
            except Exception as e:
                log.error("Receive error: %s", e)

    async def _play_audio(self):
        """Collect Gemini audio chunks and play on G1 speaker."""
        while self._running:
            try:
                if not self.speaking:
                    await asyncio.sleep(0.05)
                    continue
                # Pre-buffer
                buffered = False
                while self.speaking and not buffered:
                    if self._audio_queue.qsize() >= self.PREBUFFER_CHUNKS:
                        buffered = True
                    else:
                        await asyncio.sleep(0.01)
                # Collect all audio chunks
                buffer_chunks = []
                while self.speaking:
                    try:
                        data = await asyncio.wait_for(
                            self._audio_queue.get(), timeout=self.PLAYBACK_TIMEOUT)
                        audio = np.frombuffer(data, dtype=np.int16)
                        buffer_chunks.append(audio)
                        self._last_ai_audio_time = time.time()
                    except asyncio.TimeoutError:
                        # Stop collecting once the queue has stayed empty for a while
                        # (no more audio arriving for this turn).
                        if self._audio_queue.empty():
                            if time.time() - self._last_ai_audio_time > 0.3:
                                break
                # Play on G1 speaker
                if buffer_chunks:
                    full_audio = np.concatenate(buffer_chunks)
                    duration = len(full_audio) / RECEIVE_RATE
                    log.info("Playing %.1fs on G1", duration)
                    # Playback blocks for the clip duration; run off the event loop.
                    await asyncio.get_event_loop().run_in_executor(
                        None, self._play_buffer_on_g1, full_audio)
                self.speaking = False
            except Exception as e:
                log.error("Play error: %s", e)
                self.speaking = False

    # ─── MAIN LOOP ────────────────────────────────────────
    async def _run_async(self):
        import websockets
        import inspect
        system_prompt = load_system_prompt()
        # Unmute mic
        subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
        subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)
        # Calibrate
        self._calibrate_mic()
        # websockets renamed extra_headers -> additional_headers in newer versions;
        # detect which keyword this installation supports.
        ws_kwargs = {"max_size": None}
        try:
            sig = inspect.signature(websockets.connect)
            if "extra_headers" in sig.parameters:
                ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
            else:
                ws_kwargs["additional_headers"] = {"Content-Type": "application/json"}
        except Exception:
            ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
        # Reconnect loop: any connection error triggers a retry after 3s.
        while self._running:
            try:
                log.info("Connecting to Gemini...")
                async with websockets.connect(URI, **ws_kwargs) as ws:
                    setup_msg = {
                        "setup": {
                            "model": MODEL,
                            "generationConfig": {
                                "responseModalities": ["AUDIO"],
                                "thinkingConfig": {"thinkingBudget": 0},
                                "speechConfig": {
                                    "voiceConfig": {
                                        "prebuiltVoiceConfig": {"voiceName": VOICE_NAME}
                                    }
                                },
                            },
                            "realtimeInputConfig": {
                                "automaticActivityDetection": {
                                    "startOfSpeechSensitivity": "START_SENSITIVITY_HIGH",
                                    "prefixPaddingMs": 40,
                                    "endOfSpeechSensitivity": "END_SENSITIVITY_HIGH",
                                    "silenceDurationMs": 250,
                                }
                            },
                            "inputAudioTranscription": {},
                            "systemInstruction": {"parts": [{"text": system_prompt}]},
                        }
                    }
                    await ws.send(json.dumps(setup_msg))
                    # Wait for the setup acknowledgement before streaming.
                    await ws.recv()
                    log.info("Connected! Always listening...")
                    self._audio_queue = asyncio.Queue()
                    await asyncio.gather(
                        self._capture_mic(ws),
                        self._receive_audio(ws),
                        self._play_audio(),
                    )
            except Exception as e:
                if self._running:
                    log.error("Connection error: %s — reconnecting in 3s", e)
                    await asyncio.sleep(3)

    def _voice_thread(self):
        # Dedicated thread running its own asyncio loop for the websocket session.
        asyncio.run(self._run_async())

    # ─── START / STOP ─────────────────────────────────────
    def start(self):
        """Start the voice module in a background daemon thread (idempotent)."""
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(target=self._voice_thread, daemon=True, name="gemini_voice")
        self._thread.start()
        log.info("Gemini voice module started")

    def stop(self):
        """Signal shutdown and wait (up to 5s) for the voice thread to exit."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Gemini voice module stopped")

    @property
    def is_running(self) -> bool:
        return self._running

    @property
    def state(self) -> str:
        return "LISTENING" if self._running else "STOPPED"

    @property
    def is_speaking(self) -> bool:
        return self.speaking
# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    import sys

    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def _print_transcript(text, role):
        """Echo each transcript line to stdout with its speaker role."""
        print(f" [{role.upper()}] {text}")

    g1_audio = AudioAPI()
    voice = GeminiVoiceModule(g1_audio, on_transcript=_print_transcript)
    print("Gemini voice v2 — speak anytime. Ctrl+C to stop.\n")
    voice.start()
    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
    voice.stop()