Sanad/voice/sanad_voice.py

582 lines
24 KiB
Python

#!/usr/bin/env python3
"""Sanad voice subprocess — Gemini Live (google-genai SDK) on the G1.
Mic/speaker are selected at startup via `SANAD_AUDIO_PROFILE` (builtin |
anker | hollyland_builtin), materialised by `voice/audio_io.py`. The
default ("builtin") is UDP multicast mic + AudioClient.PlayStream.
Features: mic gain, echo suppression, barge-in, wait-for-user,
streaming playback, per-turn WAV recording.
Usage:
python3 voice/sanad_voice.py eth0
python3 voice/sanad_voice.py eth0 --voice Charon
SANAD_AUDIO_PROFILE=anker python3 voice/sanad_voice.py eth0
"""
import array
import asyncio
import json
import logging
import os
import sys
import threading
import time
import wave
from datetime import datetime
from pathlib import Path
import numpy as np
from google import genai
from google.genai import types
from unitree_sdk2py.core.channel import ChannelFactoryInitialize
from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
from Project.Sanad.voice.audio_io import AudioIO, Mic, Speaker
# ─── LOGGING ─────────────────────────────────────────────
# Log destination/name come from the project config when it imports cleanly;
# otherwise fall back to defaults so this module still starts standalone.
try:
    from Project.Sanad.core.config_loader import section as _cfg_section_log
    _LOG_CFG = _cfg_section_log("voice", "sanad_voice")
except Exception:
    _LOG_CFG = {}
LOG_DIR = os.path.expanduser(_LOG_CFG.get("log_dir", "~/logs"))
os.makedirs(LOG_DIR, exist_ok=True)
_LOG_NAME = _LOG_CFG.get("log_name", "gemini_live_v2")
# One file per calendar day, e.g. gemini_live_v2_20250101.log
LOG_FILE = os.path.join(LOG_DIR, f"{_LOG_NAME}_{datetime.now():%Y%m%d}.log")
# Mirror every record to both the daily file and stderr.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("gemini_v2")
# ─── CONFIG — single source of truth ─────────────────────
#
# Gemini credentials + audio rates live in config/core_config.json
# (exposed via config.py as GEMINI_API_KEY, GEMINI_MODEL, etc).
# Voice-loop-specific tunables live in config/voice_config.json.
try:
    from Project.Sanad.config import (
        GEMINI_API_KEY, GEMINI_VOICE,
        SEND_SAMPLE_RATE, RECEIVE_SAMPLE_RATE, CHUNK_SIZE,
    )
    from Project.Sanad.core.config_loader import section as _cfg_section
    _SV = _cfg_section("voice", "sanad_voice")
    _MIC = _cfg_section("voice", "mic_udp")
    _SP = _cfg_section("voice", "speaker")
    _REC = _cfg_section("voice", "recording")
except Exception:
    # Fallbacks let the module import without the project config package.
    GEMINI_API_KEY, GEMINI_VOICE = "", "Charon"
    SEND_SAMPLE_RATE, RECEIVE_SAMPLE_RATE, CHUNK_SIZE = 16000, 24000, 512
    _SV, _MIC, _SP, _REC = {}, {}, {}, {}
API_KEY = GEMINI_API_KEY
# Gemini Live model name (without "models/" prefix expected by google-genai SDK)
MODEL = os.environ.get("SANAD_GEMINI_MODEL",
                       "gemini-2.5-flash-native-audio-preview-12-2025")
VOICE_NAME = GEMINI_VOICE
SEND_RATE = SEND_SAMPLE_RATE        # mic → Gemini PCM rate (Hz)
RECEIVE_RATE = RECEIVE_SAMPLE_RATE  # Gemini → speaker PCM rate (Hz)
CHUNK_SAMPLES = CHUNK_SIZE          # samples per mic read
MIC_GAIN = _SV.get("mic_gain", 1.0)
PLAY_CHUNK = _SV.get("play_chunk_bytes", 96000)
# One mic chunk of 16-bit silence, used to mask echo while the robot speaks.
SILENCE_PCM = b'\x00' * (CHUNK_SAMPLES * 2)
# ─── RECORDING ───────────────────────────────────────────
# SANAD_RECORD env var overrides the config flag; any value other than "0"
# enables per-turn WAV recording.
RECORD_ENABLED = os.environ.get("SANAD_RECORD",
                                "1" if _REC.get("enabled", True) else "0") != "0"
_rec_dir_rel = _REC.get("dir_relative", "data/recordings")
RECORD_DIR = Path(
    os.environ.get(
        "SANAD_RECORD_DIR",
        str(Path(__file__).resolve().parent.parent / _rec_dir_rel),
    )
)
SYSTEM_PROMPT = (
    "You are Marcus, a bilingual humanoid robot assistant made by YS Lootah Technology, Dubai, UAE. "
    "RESPOND IN ARABIC (Gulf/Emirati dialect) OR ENGLISH ONLY. "
    "YOU MUST RESPOND UNMISTAKABLY IN THE SAME LANGUAGE THE USER SPEAKS. "
    "If the user speaks Arabic, you MUST reply in Arabic Gulf dialect. "
    "If the user speaks English, you MUST reply in English. "
    "Do NOT confuse Arabic with Japanese, Hindi, Russian, or any other language. "
    "The user is speaking Arabic or English — nothing else. "
    "Be concise — 1 to 2 sentences max. Be friendly and natural. "
    "If the user interrupts and says 'continue' or 'كمل', resume EXACTLY where you stopped. "
    "Only respond to clear human speech. Ignore background noise and silence completely. "
    "Do not respond to sounds that are not words."
)
# ─── HELPERS ─────────────────────────────────────────────
def audio_energy(pcm: bytes) -> int:
    """Return the mean absolute amplitude of native-endian 16-bit mono PCM.

    Empty or malformed input (e.g. an odd byte count) yields 0 rather
    than raising, so callers can feed raw buffers without pre-checking.
    """
    try:
        samples = array.array("h", pcm)
        if not samples:
            return 0
        return sum(map(abs, samples)) // len(samples)
    except Exception:
        return 0
# ─── TURN RECORDER ──────────────────────────────────────
class TurnRecorder:
    """Persist each conversational turn as a pair of WAV files.

    A turn opens on the first audio chunk handed to `capture_user` /
    `capture_robot` and closes on `finish_turn`. Output lands in
    SANAD_RECORD_DIR as `<timestamp>_user.wav` (16 kHz) and
    `<timestamp>_robot.wav` (24 kHz), and every turn is appended to
    `index.json` (timestamp, transcripts, durations) for later browsing
    by the dashboard. All mutation goes through an internal lock so the
    send and receive tasks can feed it concurrently.
    """

    def __init__(self, enabled: bool = True, out_dir: Path = RECORD_DIR):
        self.enabled = enabled
        self.out_dir = out_dir
        if self.enabled:
            self.out_dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
        self._user_buf: list[bytes] = []
        self._robot_buf: list[bytes] = []
        self._user_text = ""
        self._robot_text = ""
        self._started_at: float = 0.0

    def _capture(self, buf: list[bytes], pcm: bytes) -> None:
        # First chunk of either stream stamps the turn's start time.
        with self._lock:
            if not self._user_buf and not self._robot_buf:
                self._started_at = time.time()
            buf.append(pcm)

    def capture_user(self, pcm: bytes) -> None:
        """Buffer one chunk of user-mic PCM (16 kHz)."""
        if self.enabled and pcm:
            self._capture(self._user_buf, pcm)

    def capture_robot(self, pcm: bytes) -> None:
        """Buffer one chunk of Gemini output PCM (24 kHz)."""
        if self.enabled and pcm:
            self._capture(self._robot_buf, pcm)

    def add_user_text(self, text: str) -> None:
        """Accumulate the user's transcript for the current turn."""
        if text and self.enabled:
            with self._lock:
                self._user_text = f"{self._user_text} {text}".strip()

    def add_robot_text(self, text: str) -> None:
        """Accumulate the robot's transcript for the current turn."""
        if text and self.enabled:
            with self._lock:
                self._robot_text = f"{self._robot_text} {text}".strip()

    def finish_turn(self) -> dict:
        """Save current buffers to disk, reset state, return metadata."""
        if not self.enabled:
            return {}
        with self._lock:
            user_pcm = b"".join(self._user_buf)
            robot_pcm = b"".join(self._robot_buf)
            user_txt, robot_txt = self._user_text, self._robot_text
            turn_start = self._started_at
            self._user_buf.clear()
            self._robot_buf.clear()
            self._user_text = ""
            self._robot_text = ""
        if not user_pcm and not robot_pcm:
            return {}
        stamp = datetime.fromtimestamp(turn_start).strftime("%Y%m%d_%H%M%S")
        entry = {"timestamp": stamp, "started_at": turn_start,
                 "user_text": user_txt, "robot_text": robot_txt}
        try:
            # Each present stream gets its own WAV at its native rate.
            for pcm, tag, rate in ((user_pcm, "user", SEND_RATE),
                                   (robot_pcm, "robot", RECEIVE_RATE)):
                if not pcm:
                    continue
                wav_path = self.out_dir / f"{stamp}_{tag}.wav"
                self._save_wav(wav_path, pcm, rate)
                entry[f"{tag}_wav"] = str(wav_path)
                # 2 bytes/sample mono → seconds.
                entry[f"{tag}_duration_sec"] = round(len(pcm) / (rate * 2), 3)
            self._append_index(entry)
            log.info("recorded turn → %s (user %.1fs, robot %.1fs)",
                     stamp,
                     entry.get("user_duration_sec", 0),
                     entry.get("robot_duration_sec", 0))
        except Exception as exc:
            # Best-effort: never let a recording failure kill the voice loop.
            log.warning("recording save failed: %s", exc)
        return entry

    def _save_wav(self, path: Path, pcm: bytes, rate: int) -> None:
        # Mono, 16-bit container at the given sample rate.
        with wave.open(str(path), "wb") as wav:
            wav.setnchannels(1)
            wav.setsampwidth(2)
            wav.setframerate(rate)
            wav.writeframes(pcm)

    def _append_index(self, entry: dict) -> None:
        # Read-modify-write of index.json; a corrupt/missing file resets it.
        idx_path = self.out_dir / "index.json"
        payload = {"records": []}
        try:
            if idx_path.exists():
                loaded = json.loads(idx_path.read_text(encoding="utf-8"))
                if isinstance(loaded, dict):
                    payload = loaded
        except Exception:
            payload = {"records": []}
        records = payload.setdefault("records", [])
        records.append(entry)
        payload["total_records"] = len(records)
        idx_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
                            encoding="utf-8")
# Mic + speaker classes now live in voice/audio_io.py — built via
# AudioIO.from_profile() in main(). Selected with SANAD_AUDIO_PROFILE
# (builtin | anker | hollyland_builtin).
# ─── SESSION ─────────────────────────────────────────────
async def run_session(mic: Mic, speaker: Speaker, voice: str) -> None:
    """Run the Gemini Live voice loop forever, reconnecting on any failure.

    Each outer-loop iteration opens one Live session and runs two tasks:
    `send_mic` (mic → Gemini, with gain, barge-in detection and echo
    suppression) and `receive` (Gemini → speaker, with transcripts and
    per-turn recording). A session ends on server go_away, send failure,
    a 30 s receive stall, or the configured wall-clock timeout; the loop
    then reconnects with exponential backoff on repeated errors.
    """
    client = genai.Client(api_key=API_KEY)
    recorder = TurnRecorder(enabled=RECORD_ENABLED)
    if RECORD_ENABLED:
        log.info("recording enabled → %s", RECORD_DIR)
    # One LiveConnectConfig reused for every reconnect: audio-only replies,
    # the requested prebuilt voice, server-side VAD tuned from config, and
    # transcription of both directions so turns can be logged/recorded.
    config = types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice
                )
            )
        ),
        realtime_input_config=types.RealtimeInputConfig(
            automatic_activity_detection=types.AutomaticActivityDetection(
                disabled=False,
                start_of_speech_sensitivity=getattr(
                    types.StartSensitivity,
                    _cfg_section("voice", "vad").get(
                        "start_sensitivity", "START_SENSITIVITY_HIGH")),
                end_of_speech_sensitivity=getattr(
                    types.EndSensitivity,
                    _cfg_section("voice", "vad").get(
                        "end_sensitivity", "END_SENSITIVITY_LOW")),
                prefix_padding_ms=_cfg_section("voice", "vad").get("prefix_padding_ms", 20),
                silence_duration_ms=_cfg_section("voice", "vad").get("silence_duration_ms", 200),
            )
        ),
        input_audio_transcription=types.AudioTranscriptionConfig(),
        output_audio_transcription=types.AudioTranscriptionConfig(),
        system_instruction=types.Content(
            parts=[types.Part(text=SYSTEM_PROMPT)]
        ),
    )
    session_num = 0
    start_time = time.time()
    consecutive_errors = 0
    while True:
        session_num += 1
        # Per-session shared state, mutated by both tasks via nonlocal.
        speaking = False            # robot audio currently playing
        stream_started = False      # speaker stream opened for this turn
        barge_block_until = 0.0     # ignore barge-in until this timestamp
        ai_speak_start = 0.0
        last_ai_audio = 0.0
        # Barge-in tunables are re-read each session so config edits apply
        # on the next reconnect without a restart.
        _bi = _cfg_section("voice", "barge_in")
        BARGE_THRESHOLD = _bi.get("threshold", 500)
        LOUD_CHUNKS_NEEDED = _bi.get("loud_chunks_needed", 3)
        BARGE_COOLDOWN = _bi.get("cooldown_sec", 0.3)
        ECHO_SUPPRESS_BELOW = _bi.get("echo_suppress_below", 500)
        AI_SPEAK_GRACE_SEC = _bi.get("ai_speak_grace_sec", 0.15)
        uptime_min = (time.time() - start_time) / 60
        try:
            log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
                     session_num, uptime_min)
            async with client.aio.live.connect(model=MODEL, config=config) as session:
                log.info("connected — speak anytime!")
                consecutive_errors = 0  # reset on successful connect
                mic.flush()
                done = asyncio.Event()

                # ── Send mic ──
                async def send_mic():
                    """Stream amplified mic PCM to Gemini until `done` is set."""
                    nonlocal speaking, barge_block_until
                    chunk_bytes = CHUNK_SAMPLES * 2  # 16-bit mono → 2 bytes/sample
                    loud_count = 0
                    last_activity = time.time()
                    loop = asyncio.get_event_loop()
                    while not done.is_set():
                        try:
                            # mic.read_chunk blocks, so run it off-loop.
                            raw = await loop.run_in_executor(
                                None, lambda: mic.read_chunk(chunk_bytes))
                        except Exception:
                            break
                        # Amplify
                        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
                        samples = np.clip(samples * MIC_GAIN, -32768, 32767).astype(np.int16)
                        data = samples.tobytes()
                        energy = audio_energy(data)
                        now = time.time()
                        # Barge-in: while the robot speaks (past a short grace
                        # window), count consecutive loud chunks; enough of
                        # them interrupts playback, then a cooldown applies.
                        if speaking and now >= barge_block_until:
                            if (now - ai_speak_start) >= AI_SPEAK_GRACE_SEC:
                                if energy > BARGE_THRESHOLD:
                                    loud_count += 1
                                else:
                                    loud_count = max(0, loud_count - 1)
                                if loud_count > LOUD_CHUNKS_NEEDED:
                                    log.info("BARGE-IN (e=%d)", energy)
                                    do_interrupt("barge-in")
                                    loud_count = 0
                                    barge_block_until = now + BARGE_COOLDOWN
                        # Echo suppression: quiet mic input during robot
                        # speech is replaced with silence so Gemini's VAD
                        # doesn't hear the speaker's own output.
                        send_data = data
                        if speaking and energy < ECHO_SUPPRESS_BELOW:
                            send_data = SILENCE_PCM[:chunk_bytes]
                        # Record user audio (only when clearly speaking,
                        # energy > 250 — skip ambient silence noise)
                        if energy > 250 and not speaking:
                            recorder.capture_user(data)
                        # Watchdog: heartbeat log every ~10 s of silence.
                        if energy > 250:
                            last_activity = now
                        elif now - last_activity > 10:
                            # NOTE(review): reaches into mic._buf for the log —
                            # assumes the Mic implementation exposes it.
                            log.info("alive (no speech %.0fs, e=%d, buf=%d)",
                                     now - last_activity, energy, len(mic._buf))
                            last_activity = now
                        try:
                            await session.send_realtime_input(
                                audio=types.Blob(
                                    data=send_data,
                                    mime_type=f"audio/pcm;rate={SEND_RATE}"
                                )
                            )
                        except asyncio.CancelledError:
                            return
                        except Exception as e:
                            log.warning("mic send failed: %s — ending session", e)
                            done.set()
                            return
                        # Pace sends to real time (one chunk's duration).
                        await asyncio.sleep(CHUNK_SAMPLES / SEND_RATE)
                    log.info("send_mic task ended")

                # ── Interrupt helper ──
                def do_interrupt(source="local"):
                    """Stop playback, reset turn state, and flush the turn."""
                    nonlocal speaking, stream_started
                    speaking = False
                    stream_started = False
                    speaker.stop()
                    mic.flush()
                    recorder.finish_turn()

                # ── Receive ──
                async def receive():
                    """Consume server events: audio, transcripts, turn ends."""
                    nonlocal speaking, stream_started
                    nonlocal ai_speak_start, last_ai_audio
                    loop = asyncio.get_event_loop()
                    try:
                        last_recv = time.time()
                        while not done.is_set():
                            async for response in session.receive():
                                last_recv = time.time()
                                if done.is_set():
                                    break
                                # Server going away — reconnect soon
                                if hasattr(response, 'go_away') and response.go_away is not None:
                                    log.info("server going away — will reconnect")
                                    done.set()
                                    return
                                sc = response.server_content
                                if sc is None:
                                    continue
                                # Gemini interrupted
                                if sc.interrupted is True:
                                    if speaking:
                                        log.info("Gemini interrupted")
                                        do_interrupt("gemini")
                                    continue
                                # User transcript
                                if sc.input_transcription:
                                    text = (sc.input_transcription.text or "").strip()
                                    if text and not speaking:
                                        log.info("USER: %s", text)
                                        recorder.add_user_text(text)
                                # Marcus transcript
                                if sc.output_transcription:
                                    text = (sc.output_transcription.text or "").strip()
                                    if text:
                                        log.info("MARCUS: %s", text)
                                        recorder.add_robot_text(text)
                                # AI audio
                                if sc.model_turn:
                                    for part in sc.model_turn.parts:
                                        if part.inline_data and part.inline_data.data:
                                            now = time.time()
                                            if not speaking:
                                                ai_speak_start = now
                                                speaking = True
                                            last_ai_audio = now
                                            raw_audio = part.inline_data.data
                                            recorder.capture_robot(raw_audio)
                                            audio = np.frombuffer(
                                                raw_audio, dtype=np.int16)
                                            # Speaker calls block → executor.
                                            if not stream_started:
                                                await loop.run_in_executor(
                                                    None, speaker.begin_stream)
                                                stream_started = True
                                            await loop.run_in_executor(
                                                None, speaker.send_chunk,
                                                audio, RECEIVE_RATE)
                                # Turn complete
                                if sc.turn_complete:
                                    if speaking and stream_started and not speaker.interrupted:
                                        dur = speaker.total_sent_sec
                                        log.info("speaker %.1fs", dur)
                                        # Drain playback before listening again.
                                        await loop.run_in_executor(
                                            None, speaker.wait_finish)
                                    elif speaking and speaker.interrupted:
                                        log.info("speaker interrupted")
                                    speaking = False
                                    stream_started = False
                                    mic.flush()
                                    recorder.finish_turn()
                                    log.info("listening")
                            # receive() iterator ended — check if session is still alive
                            if time.time() - last_recv > 30:
                                log.warning("no messages from Gemini for 30s — session dead")
                                break
                            await asyncio.sleep(0.1)
                    except Exception as e:
                        log.warning("receive ended: %s", e)
                    finally:
                        # Always release send_mic so the session can wind down.
                        done.set()

                try:
                    await asyncio.wait_for(
                        asyncio.gather(send_mic(), receive()),
                        timeout=_SV.get("session_timeout_sec", 660),  # 11 min max (server go_away at ~10 min)
                    )
                except asyncio.TimeoutError:
                    log.warning("session timed out after 11 min")
                except asyncio.CancelledError:
                    log.warning("session cancelled")
                log.info("session #%d ended — reconnecting in 1s", session_num)
                speaker.stop()
                mic.flush()
                await asyncio.sleep(1)
        except asyncio.CancelledError:
            log.info("cancelled — stopping")
            break
        except KeyboardInterrupt:
            log.info("keyboard interrupt — stopping")
            break
        except Exception as e:
            consecutive_errors += 1
            # Exponential backoff: 2s, 4s, 8s, 16s, max 30s
            delay = min(30, 2 ** consecutive_errors)
            log.error("session error (#%d): %s — reconnecting in %ds",
                      consecutive_errors, e, delay)
            await asyncio.sleep(delay)
            # After 10 consecutive errors, restart the client
            if consecutive_errors >= 10:
                log.warning("10 consecutive errors — recreating client")
                try:
                    client = genai.Client(api_key=API_KEY)
                    consecutive_errors = 0
                except Exception as ce:
                    log.error("client recreation failed: %s", ce)
# ─── MAIN ────────────────────────────────────────────────
def main():
    """CLI entry: `python3 voice/sanad_voice.py <iface> [--voice NAME]`.

    Initializes DDS on the given interface, builds the audio I/O pair from
    SANAD_AUDIO_PROFILE, runs a 2 s mic sanity check, then hands control to
    run_session() until Ctrl-C. Always stops the audio stack on exit.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    iface = sys.argv[1]
    voice = VOICE_NAME
    if "--voice" in sys.argv:
        idx = sys.argv.index("--voice")
        # Fix: a trailing "--voice" with no value used to raise IndexError.
        if idx + 1 < len(sys.argv):
            voice = sys.argv[idx + 1]
        else:
            log.warning("--voice given without a value — keeping %s", voice)
    log.info("DDS on %s", iface)
    ChannelFactoryInitialize(0, iface)
    ac = AudioClient()
    ac.SetTimeout(10.0)
    ac.Init()
    log.info("AudioClient ready")
    # Mic/speaker pair is selected by the SANAD_AUDIO_PROFILE env var.
    profile = os.environ.get("SANAD_AUDIO_PROFILE", "builtin")
    audio = AudioIO.from_profile(profile, audio_client=ac)
    audio.start()
    mic, speaker = audio.mic, audio.speaker
    log.info("audio profile=%s", audio.profile_id)
    # Quick sanity check: let buffers fill for 2 s, then sample one chunk.
    log.info("testing mic 2s...")
    time.sleep(2)
    test = mic.read_chunk(1024)
    e = audio_energy(test)
    log.info("mic energy=%d %s", e, "OK" if e > 0 else "SILENT")
    log.info("voice=%s log=%s", voice, LOG_FILE)
    # Fix: was `"" * 50`, which logged an empty line — restore the separator.
    log.info("─" * 50)
    try:
        asyncio.run(run_session(mic, speaker, voice))
    except KeyboardInterrupt:
        pass
    except Exception as e:
        log.error("fatal: %s", e)
    finally:
        log.info("stopped")
        audio.stop()


if __name__ == "__main__":
    main()