#!/usr/bin/env python3 """Sanad voice subprocess — Gemini Live (google-genai SDK) on the G1. Mic/speaker are selected at startup via `SANAD_AUDIO_PROFILE` (builtin | anker | hollyland_builtin), materialised by `voice/audio_io.py`. The default ("builtin") is UDP multicast mic + AudioClient.PlayStream. Features: mic gain, echo suppression, barge-in, wait-for-user, streaming playback, per-turn WAV recording. Usage: python3 voice/sanad_voice.py eth0 python3 voice/sanad_voice.py eth0 --voice Charon SANAD_AUDIO_PROFILE=anker python3 voice/sanad_voice.py eth0 """ import array import asyncio import json import logging import os import sys import threading import time import wave from datetime import datetime from pathlib import Path import numpy as np from google import genai from google.genai import types from unitree_sdk2py.core.channel import ChannelFactoryInitialize from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient from Project.Sanad.voice.audio_io import AudioIO, Mic, Speaker # ─── LOGGING ───────────────────────────────────────────── try: from Project.Sanad.core.config_loader import section as _cfg_section_log _LOG_CFG = _cfg_section_log("voice", "sanad_voice") except Exception: _LOG_CFG = {} LOG_DIR = os.path.expanduser(_LOG_CFG.get("log_dir", "~/logs")) os.makedirs(LOG_DIR, exist_ok=True) _LOG_NAME = _LOG_CFG.get("log_name", "gemini_live_v2") LOG_FILE = os.path.join(LOG_DIR, f"{_LOG_NAME}_{datetime.now():%Y%m%d}.log") logging.basicConfig( level=logging.INFO, format="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)s", datefmt="%H:%M:%S", handlers=[ logging.FileHandler(LOG_FILE), logging.StreamHandler(), ], ) log = logging.getLogger("gemini_v2") # ─── CONFIG — single source of truth ───────────────────── # # Gemini credentials + audio rates live in config/core_config.json # (exposed via config.py as GEMINI_API_KEY, GEMINI_MODEL, etc). # Voice-loop-specific tunables live in config/voice_config.json. 
# Pull audio/Gemini settings from the project config; if the project package
# is unavailable (e.g. standalone run), fall back to safe defaults.
try:
    from Project.Sanad.config import (
        GEMINI_API_KEY,
        GEMINI_VOICE,
        SEND_SAMPLE_RATE,
        RECEIVE_SAMPLE_RATE,
        CHUNK_SIZE,
    )
    from Project.Sanad.core.config_loader import section as _cfg_section
    _SV = _cfg_section("voice", "sanad_voice")
    _MIC = _cfg_section("voice", "mic_udp")
    _SP = _cfg_section("voice", "speaker")
    _REC = _cfg_section("voice", "recording")
except Exception:
    GEMINI_API_KEY, GEMINI_VOICE = "", "Charon"
    SEND_SAMPLE_RATE, RECEIVE_SAMPLE_RATE, CHUNK_SIZE = 16000, 24000, 512
    _SV, _MIC, _SP, _REC = {}, {}, {}, {}

API_KEY = GEMINI_API_KEY
# Gemini Live model name (without "models/" prefix expected by google-genai SDK)
MODEL = os.environ.get("SANAD_GEMINI_MODEL", "gemini-2.5-flash-native-audio-preview-12-2025")
VOICE_NAME = GEMINI_VOICE
SEND_RATE = SEND_SAMPLE_RATE          # mic → Gemini sample rate (Hz)
RECEIVE_RATE = RECEIVE_SAMPLE_RATE    # Gemini → speaker sample rate (Hz)
CHUNK_SAMPLES = CHUNK_SIZE            # samples per mic chunk
MIC_GAIN = _SV.get("mic_gain", 1.0)
PLAY_CHUNK = _SV.get("play_chunk_bytes", 96000)
# One chunk of 16-bit silence, used for echo suppression.
SILENCE_PCM = b'\x00' * (CHUNK_SAMPLES * 2)

# ─── RECORDING ───────────────────────────────────────────
# SANAD_RECORD env var overrides the config flag; any value other than "0"
# enables per-turn WAV recording.
RECORD_ENABLED = os.environ.get("SANAD_RECORD", "1" if _REC.get("enabled", True) else "0") != "0"
_rec_dir_rel = _REC.get("dir_relative", "data/recordings")
RECORD_DIR = Path(
    os.environ.get(
        "SANAD_RECORD_DIR",
        str(Path(__file__).resolve().parent.parent / _rec_dir_rel),
    )
)

SYSTEM_PROMPT = (
    "You are Marcus, a bilingual humanoid robot assistant made by YS Lootah Technology, Dubai, UAE. "
    "RESPOND IN ARABIC (Gulf/Emirati dialect) OR ENGLISH ONLY. "
    "YOU MUST RESPOND UNMISTAKABLY IN THE SAME LANGUAGE THE USER SPEAKS. "
    "If the user speaks Arabic, you MUST reply in Arabic Gulf dialect. "
    "If the user speaks English, you MUST reply in English. "
    "Do NOT confuse Arabic with Japanese, Hindi, Russian, or any other language. "
    "The user is speaking Arabic or English — nothing else. "
    "Be concise — 1 to 2 sentences max. Be friendly and natural. "
    "If the user interrupts and says 'continue' or 'كمل', resume EXACTLY where you stopped. "
    "Only respond to clear human speech. Ignore background noise and silence completely. "
    "Do not respond to sounds that are not words."
)

# ─── HELPERS ─────────────────────────────────────────────
def audio_energy(pcm: bytes) -> int:
    """Return the mean absolute sample value of 16-bit mono PCM.

    Returns 0 for empty input or any malformed buffer (odd byte count).
    """
    try:
        samples = array.array("h", pcm)
    except Exception:
        return 0
    if not samples:
        return 0
    return sum(abs(s) for s in samples) // len(samples)

# ─── TURN RECORDER ──────────────────────────────────────
class TurnRecorder:
    """Persists each conversation turn as two WAV files: user mic + Gemini output.

    A turn begins when audio first flows through `capture_user` /
    `capture_robot` and ends when `finish_turn` is called. Files are
    written into `out_dir` as `<stamp>_user.wav` (SEND_RATE) and
    `<stamp>_robot.wav` (RECEIVE_RATE). An `index.json` in the same
    directory accumulates per-turn metadata (timestamp, transcripts,
    durations) for later browsing. All mutating methods are guarded by
    a single lock so capture and finish can race safely.
    """

    def __init__(self, enabled: bool = True, out_dir: Path = RECORD_DIR):
        self.enabled = enabled
        self.out_dir = out_dir
        if enabled:
            out_dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
        self._user_buf: list[bytes] = []
        self._robot_buf: list[bytes] = []
        self._user_text = ""
        self._robot_text = ""
        self._started_at: float = 0.0

    def _mark_start_if_new(self) -> None:
        # Stamp the turn start the first time any audio arrives.
        # Caller must already hold self._lock.
        if not (self._user_buf or self._robot_buf):
            self._started_at = time.time()

    def capture_user(self, pcm: bytes) -> None:
        """Buffer one chunk of user-mic PCM (no-op when disabled/empty)."""
        if self.enabled and pcm:
            with self._lock:
                self._mark_start_if_new()
                self._user_buf.append(pcm)

    def capture_robot(self, pcm: bytes) -> None:
        """Buffer one chunk of Gemini-output PCM (no-op when disabled/empty)."""
        if self.enabled and pcm:
            with self._lock:
                self._mark_start_if_new()
                self._robot_buf.append(pcm)

    def add_user_text(self, text: str) -> None:
        """Append a user-transcript fragment to the current turn."""
        if not (text and self.enabled):
            return
        with self._lock:
            self._user_text = f"{self._user_text} {text}".strip()

    def add_robot_text(self, text: str) -> None:
        """Append a robot-transcript fragment to the current turn."""
        if not (text and self.enabled):
            return
        with self._lock:
            self._robot_text = f"{self._robot_text} {text}".strip()

    def finish_turn(self) -> dict:
        """Flush buffered audio/text to disk, reset state, return the index entry.

        Returns an empty dict when recording is disabled or the turn
        captured no audio at all. Disk failures are logged, not raised.
        """
        if not self.enabled:
            return {}
        with self._lock:
            user_pcm = b"".join(self._user_buf)
            robot_pcm = b"".join(self._robot_buf)
            user_text, robot_text = self._user_text, self._robot_text
            started = self._started_at
            self._user_buf.clear()
            self._robot_buf.clear()
            self._user_text = self._robot_text = ""
        if not (user_pcm or robot_pcm):
            return {}
        stamp = datetime.fromtimestamp(started).strftime("%Y%m%d_%H%M%S")
        entry = {
            "timestamp": stamp,
            "started_at": started,
            "user_text": user_text,
            "robot_text": robot_text,
        }
        try:
            for pcm, rate, tag in ((user_pcm, SEND_RATE, "user"),
                                   (robot_pcm, RECEIVE_RATE, "robot")):
                if not pcm:
                    continue
                wav_path = self.out_dir / f"{stamp}_{tag}.wav"
                self._save_wav(wav_path, pcm, rate)
                entry[f"{tag}_wav"] = str(wav_path)
                # 2 bytes per 16-bit sample → seconds = bytes / (rate * 2)
                entry[f"{tag}_duration_sec"] = round(len(pcm) / (rate * 2), 3)
            self._append_index(entry)
            log.info("recorded turn → %s (user %.1fs, robot %.1fs)", stamp,
                     entry.get("user_duration_sec", 0),
                     entry.get("robot_duration_sec", 0))
        except Exception as exc:
            log.warning("recording save failed: %s", exc)
        return entry

    def _save_wav(self, path: Path, pcm: bytes, rate: int) -> None:
        """Write raw PCM to `path` as a mono 16-bit WAV at `rate` Hz."""
        with wave.open(str(path), "wb") as out:
            out.setnchannels(1)    # mono
            out.setsampwidth(2)    # 16-bit samples
            out.setframerate(rate)
            out.writeframes(pcm)

    def _append_index(self, entry: dict) -> None:
        """Append `entry` to index.json, tolerating a missing/corrupt index."""
        index_file = self.out_dir / "index.json"
        payload = {"records": []}
        try:
            if index_file.exists():
                loaded = json.loads(index_file.read_text(encoding="utf-8"))
                if isinstance(loaded, dict):
                    payload = loaded
        except Exception:
            payload = {"records": []}
        records = payload.setdefault("records", [])
        records.append(entry)
        payload["total_records"] = len(records)
        index_file.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
                              encoding="utf-8")


# Mic + speaker classes now live in voice/audio_io.py — built via
# AudioIO.from_profile() in main().
# Selected with SANAD_AUDIO_PROFILE (builtin | anker | hollyland_builtin).

# ─── SESSION ─────────────────────────────────────────────
async def run_session(mic: Mic, speaker: Speaker, voice: str):
    """Connect to Gemini Live and run the mic→model→speaker loop forever.

    Reconnects after every session end or error (exponential backoff after
    repeated failures, client recreated after 10 consecutive errors).
    `mic`/`speaker` come from AudioIO.from_profile(); `voice` is the
    prebuilt Gemini voice name.
    """
    client = genai.Client(api_key=API_KEY)
    recorder = TurnRecorder(enabled=RECORD_ENABLED)
    if RECORD_ENABLED:
        log.info("recording enabled → %s", RECORD_DIR)

    # Static Live API config: audio-only responses, chosen voice,
    # server-side VAD tuned from voice_config, transcripts both ways.
    config = types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice
                )
            )
        ),
        realtime_input_config=types.RealtimeInputConfig(
            automatic_activity_detection=types.AutomaticActivityDetection(
                disabled=False,
                start_of_speech_sensitivity=getattr(
                    types.StartSensitivity,
                    _cfg_section("voice", "vad").get(
                        "start_sensitivity", "START_SENSITIVITY_HIGH")),
                end_of_speech_sensitivity=getattr(
                    types.EndSensitivity,
                    _cfg_section("voice", "vad").get(
                        "end_sensitivity", "END_SENSITIVITY_LOW")),
                prefix_padding_ms=_cfg_section("voice", "vad").get("prefix_padding_ms", 20),
                silence_duration_ms=_cfg_section("voice", "vad").get("silence_duration_ms", 200),
            )
        ),
        input_audio_transcription=types.AudioTranscriptionConfig(),
        output_audio_transcription=types.AudioTranscriptionConfig(),
        system_instruction=types.Content(
            parts=[types.Part(text=SYSTEM_PROMPT)]
        ),
    )

    session_num = 0
    start_time = time.time()
    consecutive_errors = 0

    while True:
        session_num += 1
        # Per-session playback state shared by the nested tasks below.
        speaking = False           # Gemini audio currently playing
        stream_started = False     # speaker stream opened for this turn
        barge_block_until = 0.0    # ignore barge-in until this time
        ai_speak_start = 0.0       # when the current AI utterance began
        last_ai_audio = 0.0        # NOTE(review): written but never read here

        # Barge-in tunables re-read each session so config edits apply on reconnect.
        _bi = _cfg_section("voice", "barge_in")
        BARGE_THRESHOLD = _bi.get("threshold", 500)
        LOUD_CHUNKS_NEEDED = _bi.get("loud_chunks_needed", 3)
        BARGE_COOLDOWN = _bi.get("cooldown_sec", 0.3)
        ECHO_SUPPRESS_BELOW = _bi.get("echo_suppress_below", 500)
        AI_SPEAK_GRACE_SEC = _bi.get("ai_speak_grace_sec", 0.15)

        uptime_min = (time.time() - start_time) / 60
        try:
            log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
                     session_num, uptime_min)
            async with client.aio.live.connect(model=MODEL, config=config) as session:
                log.info("connected — speak anytime!")
                consecutive_errors = 0  # reset on successful connect
                mic.flush()
                done = asyncio.Event()  # set by either task to end the session

                # ── Send mic ──
                async def send_mic():
                    """Read mic chunks, apply gain, detect barge-in, stream PCM to Gemini."""
                    nonlocal speaking, barge_block_until
                    chunk_bytes = CHUNK_SAMPLES * 2
                    loud_count = 0
                    last_activity = time.time()
                    loop = asyncio.get_event_loop()
                    while not done.is_set():
                        try:
                            # Blocking mic read runs in the default executor.
                            raw = await loop.run_in_executor(
                                None, lambda: mic.read_chunk(chunk_bytes))
                        except Exception:
                            break
                        # Amplify (float32 math, clipped back to int16)
                        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
                        samples = np.clip(samples * MIC_GAIN, -32768, 32767).astype(np.int16)
                        data = samples.tobytes()
                        energy = audio_energy(data)
                        now = time.time()
                        # Barge-in: require several consecutive loud chunks while the
                        # AI is speaking (after a short grace period) before interrupting.
                        if speaking and now >= barge_block_until:
                            if (now - ai_speak_start) >= AI_SPEAK_GRACE_SEC:
                                if energy > BARGE_THRESHOLD:
                                    loud_count += 1
                                else:
                                    loud_count = max(0, loud_count - 1)
                                if loud_count > LOUD_CHUNKS_NEEDED:
                                    log.info("BARGE-IN (e=%d)", energy)
                                    do_interrupt("barge-in")
                                    loud_count = 0
                                    barge_block_until = now + BARGE_COOLDOWN
                        # Echo suppression: while the AI speaks, replace quiet mic
                        # input (likely speaker bleed) with silence.
                        send_data = data
                        if speaking and energy < ECHO_SUPPRESS_BELOW:
                            send_data = SILENCE_PCM[:chunk_bytes]
                        # Record user audio (only when clearly speaking,
                        # energy > 250 — skip ambient silence noise)
                        if energy > 250 and not speaking:
                            recorder.capture_user(data)
                        # Watchdog: periodic liveness log during long silences.
                        if energy > 250:
                            last_activity = now
                        elif now - last_activity > 10:
                            log.info("alive (no speech %.0fs, e=%d, buf=%d)",
                                     now - last_activity, energy, len(mic._buf))
                            last_activity = now
                        try:
                            await session.send_realtime_input(
                                audio=types.Blob(
                                    data=send_data,
                                    mime_type=f"audio/pcm;rate={SEND_RATE}"
                                )
                            )
                        except asyncio.CancelledError:
                            return
                        except Exception as e:
                            log.warning("mic send failed: %s — ending session", e)
                            done.set()
                            return
                        # Pace sends at roughly real-time chunk duration.
                        await asyncio.sleep(CHUNK_SAMPLES / SEND_RATE)
                    log.info("send_mic task ended")

                # ── Interrupt helper ──
                def do_interrupt(source="local"):
                    # Stop playback, drop stale mic input, and close out the
                    # half-finished turn's recording.
                    nonlocal speaking, stream_started
                    speaking = False
                    stream_started = False
                    speaker.stop()
                    mic.flush()
                    recorder.finish_turn()

                # ── Receive ──
                async def receive():
                    """Consume Gemini messages: transcripts, audio chunks, turn lifecycle."""
                    nonlocal speaking, stream_started
                    nonlocal ai_speak_start, last_ai_audio
                    loop = asyncio.get_event_loop()
                    try:
                        last_recv = time.time()
                        while not done.is_set():
                            async for response in session.receive():
                                last_recv = time.time()
                                if done.is_set():
                                    break
                                # Server going away — reconnect soon
                                if hasattr(response, 'go_away') and response.go_away is not None:
                                    log.info("server going away — will reconnect")
                                    done.set()
                                    return
                                sc = response.server_content
                                if sc is None:
                                    continue
                                # Gemini interrupted (server-side VAD detected user speech)
                                if sc.interrupted is True:
                                    if speaking:
                                        log.info("Gemini interrupted")
                                        do_interrupt("gemini")
                                    continue
                                # User transcript
                                if sc.input_transcription:
                                    text = (sc.input_transcription.text or "").strip()
                                    if text and not speaking:
                                        log.info("USER: %s", text)
                                        recorder.add_user_text(text)
                                # Marcus transcript
                                if sc.output_transcription:
                                    text = (sc.output_transcription.text or "").strip()
                                    if text:
                                        log.info("MARCUS: %s", text)
                                        recorder.add_robot_text(text)
                                # AI audio: record, then stream to the speaker
                                # (blocking speaker calls go through the executor).
                                if sc.model_turn:
                                    for part in sc.model_turn.parts:
                                        if part.inline_data and part.inline_data.data:
                                            now = time.time()
                                            if not speaking:
                                                ai_speak_start = now
                                            speaking = True
                                            last_ai_audio = now
                                            raw_audio = part.inline_data.data
                                            recorder.capture_robot(raw_audio)
                                            audio = np.frombuffer(
                                                raw_audio, dtype=np.int16)
                                            if not stream_started:
                                                await loop.run_in_executor(
                                                    None, speaker.begin_stream)
                                                stream_started = True
                                            await loop.run_in_executor(
                                                None, speaker.send_chunk, audio, RECEIVE_RATE)
                                # Turn complete: drain the speaker, reset turn
                                # state, and persist the recording.
                                if sc.turn_complete:
                                    if speaking and stream_started and not speaker.interrupted:
                                        dur = speaker.total_sent_sec
                                        log.info("speaker %.1fs", dur)
                                        await loop.run_in_executor(
                                            None, speaker.wait_finish)
                                    elif speaking and speaker.interrupted:
                                        log.info("speaker interrupted")
                                    speaking = False
                                    stream_started = False
                                    mic.flush()
                                    recorder.finish_turn()
                                    log.info("listening")
                            # receive() iterator ended — check if session is still alive
                            if time.time() - last_recv > 30:
                                log.warning("no messages from Gemini for 30s — session dead")
                                break
                            await asyncio.sleep(0.1)
                    except Exception as e:
                        log.warning("receive ended: %s", e)
                    finally:
                        done.set()  # ensure send_mic also stops

                try:
                    await asyncio.wait_for(
                        asyncio.gather(send_mic(), receive()),
                        timeout=_SV.get("session_timeout_sec", 660),  # 11 min max (server go_away at ~10 min)
                    )
                except asyncio.TimeoutError:
                    log.warning("session timed out after 11 min")
                except asyncio.CancelledError:
                    log.warning("session cancelled")

            log.info("session #%d ended — reconnecting in 1s", session_num)
            speaker.stop()
            mic.flush()
            await asyncio.sleep(1)

        except asyncio.CancelledError:
            log.info("cancelled — stopping")
            break
        except KeyboardInterrupt:
            log.info("keyboard interrupt — stopping")
            break
        except Exception as e:
            consecutive_errors += 1
            # Exponential backoff: 2s, 4s, 8s, 16s, max 30s
            delay = min(30, 2 ** consecutive_errors)
            log.error("session error (#%d): %s — reconnecting in %ds",
                      consecutive_errors, e, delay)
            await asyncio.sleep(delay)
            # After 10 consecutive errors, restart the client
            if consecutive_errors >= 10:
                log.warning("10 consecutive errors — recreating client")
                try:
                    client = genai.Client(api_key=API_KEY)
                    consecutive_errors = 0
                except Exception as ce:
                    log.error("client recreation failed: %s", ce)


# ─── MAIN ────────────────────────────────────────────────
def main():
    """CLI entry: DDS init, audio profile setup, mic sanity check, then run_session."""
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    iface = sys.argv[1]
    voice = VOICE_NAME
    if "--voice" in sys.argv:
        idx = sys.argv.index("--voice")
        voice = sys.argv[idx + 1]

    log.info("DDS on %s", iface)
    ChannelFactoryInitialize(0, iface)
    ac = AudioClient()
    ac.SetTimeout(10.0)
    ac.Init()
    log.info("AudioClient ready")

    # Build mic/speaker pair for the selected hardware profile.
    profile = os.environ.get("SANAD_AUDIO_PROFILE", "builtin")
    audio = AudioIO.from_profile(profile, audio_client=ac)
    audio.start()
    mic, speaker = audio.mic, audio.speaker
    log.info("audio profile=%s", audio.profile_id)

    # Quick sanity check: after 2s of capture the mic should show nonzero energy.
    log.info("testing mic 2s...")
    time.sleep(2)
    test = mic.read_chunk(1024)
    e = audio_energy(test)
    log.info("mic energy=%d %s", e, "OK" if e > 0 else "SILENT")

    log.info("voice=%s log=%s", voice, LOG_FILE)
    log.info("─" * 50)
    try:
        asyncio.run(run_session(mic, speaker, voice))
    except KeyboardInterrupt:
        pass
    except Exception as e:
        log.error("fatal: %s", e)
    finally:
        log.info("stopped")
        audio.stop()


if __name__ == "__main__":
    main()