#!/usr/bin/env python3
|
|
"""Sanad voice subprocess — Gemini Live (google-genai SDK) on the G1.
|
|
|
|
Mic/speaker are selected at startup via `SANAD_AUDIO_PROFILE` (builtin |
|
|
anker | hollyland_builtin), materialised by `voice/audio_io.py`. The
|
|
default ("builtin") is UDP multicast mic + AudioClient.PlayStream.
|
|
|
|
Features: mic gain, echo suppression, barge-in, wait-for-user,
|
|
streaming playback, per-turn WAV recording.
|
|
|
|
Usage:
|
|
python3 voice/sanad_voice.py eth0
|
|
python3 voice/sanad_voice.py eth0 --voice Charon
|
|
SANAD_AUDIO_PROFILE=anker python3 voice/sanad_voice.py eth0
|
|
"""
|
|
|
|
import array
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
import threading
|
|
import time
|
|
import wave
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
|
|
from google import genai
|
|
from google.genai import types
|
|
|
|
from unitree_sdk2py.core.channel import ChannelFactoryInitialize
|
|
from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
|
|
|
|
from Project.Sanad.voice.audio_io import AudioIO, Mic, Speaker
|
|
|
|
# ─── LOGGING ─────────────────────────────────────────────
# Dual-sink logging: a per-day-named file under LOG_DIR plus the console.

# The project config loader may be unavailable (e.g. when running this
# module off-robot) — fall back to empty config and the defaults below.
try:
    from Project.Sanad.core.config_loader import section as _cfg_section_log
    _LOG_CFG = _cfg_section_log("voice", "sanad_voice")
except Exception:
    _LOG_CFG = {}
LOG_DIR = os.path.expanduser(_LOG_CFG.get("log_dir", "~/logs"))
os.makedirs(LOG_DIR, exist_ok=True)
_LOG_NAME = _LOG_CFG.get("log_name", "gemini_live_v2")
# File name carries the process start date, e.g. gemini_live_v2_20250101.log
LOG_FILE = os.path.join(LOG_DIR, f"{_LOG_NAME}_{datetime.now():%Y%m%d}.log")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    handlers=[
        logging.FileHandler(LOG_FILE),   # persistent file sink
        logging.StreamHandler(),         # mirror to console
    ],
)
log = logging.getLogger("gemini_v2")
|
|
|
|
# ─── CONFIG — single source of truth ─────────────────────
#
# Gemini credentials + audio rates live in config/core_config.json
# (exposed via config.py as GEMINI_API_KEY, GEMINI_MODEL, etc).
# Voice-loop-specific tunables live in config/voice_config.json.
try:
    from Project.Sanad.config import (
        GEMINI_API_KEY, GEMINI_VOICE,
        SEND_SAMPLE_RATE, RECEIVE_SAMPLE_RATE, CHUNK_SIZE,
    )
    from Project.Sanad.core.config_loader import section as _cfg_section
    _SV = _cfg_section("voice", "sanad_voice")
    _MIC = _cfg_section("voice", "mic_udp")
    _SP = _cfg_section("voice", "speaker")
    _REC = _cfg_section("voice", "recording")
except Exception:
    # Fallbacks keep the module importable without the project config
    # (no API key means a connect attempt will fail, but imports succeed).
    GEMINI_API_KEY, GEMINI_VOICE = "", "Charon"
    SEND_SAMPLE_RATE, RECEIVE_SAMPLE_RATE, CHUNK_SIZE = 16000, 24000, 512
    _SV, _MIC, _SP, _REC = {}, {}, {}, {}

API_KEY = GEMINI_API_KEY
# Gemini Live model name (without "models/" prefix expected by google-genai SDK)
MODEL = os.environ.get("SANAD_GEMINI_MODEL",
                       "gemini-2.5-flash-native-audio-preview-12-2025")
VOICE_NAME = GEMINI_VOICE

SEND_RATE = SEND_SAMPLE_RATE        # Hz, mic PCM sent to Gemini
RECEIVE_RATE = RECEIVE_SAMPLE_RATE  # Hz, audio received from Gemini
CHUNK_SAMPLES = CHUNK_SIZE          # samples per mic chunk
MIC_GAIN = _SV.get("mic_gain", 1.0)  # software gain multiplier on raw mic PCM

PLAY_CHUNK = _SV.get("play_chunk_bytes", 96000)
# One mic chunk of 16-bit silence, substituted during echo suppression.
SILENCE_PCM = b'\x00' * (CHUNK_SAMPLES * 2)

# ─── RECORDING ───────────────────────────────────────────
# SANAD_RECORD=0 disables per-turn WAV recording regardless of config;
# SANAD_RECORD_DIR overrides the output directory.
RECORD_ENABLED = os.environ.get("SANAD_RECORD",
                                "1" if _REC.get("enabled", True) else "0") != "0"
_rec_dir_rel = _REC.get("dir_relative", "data/recordings")
RECORD_DIR = Path(
    os.environ.get(
        "SANAD_RECORD_DIR",
        str(Path(__file__).resolve().parent.parent / _rec_dir_rel),
    )
)

# Sent once per Live session as the `system_instruction`.
SYSTEM_PROMPT = (
    "You are Marcus, a bilingual humanoid robot assistant made by YS Lootah Technology, Dubai, UAE. "
    "RESPOND IN ARABIC (Gulf/Emirati dialect) OR ENGLISH ONLY. "
    "YOU MUST RESPOND UNMISTAKABLY IN THE SAME LANGUAGE THE USER SPEAKS. "
    "If the user speaks Arabic, you MUST reply in Arabic Gulf dialect. "
    "If the user speaks English, you MUST reply in English. "
    "Do NOT confuse Arabic with Japanese, Hindi, Russian, or any other language. "
    "The user is speaking Arabic or English — nothing else. "
    "Be concise — 1 to 2 sentences max. Be friendly and natural. "
    "If the user interrupts and says 'continue' or 'كمل', resume EXACTLY where you stopped. "
    "Only respond to clear human speech. Ignore background noise and silence completely. "
    "Do not respond to sounds that are not words."
)
|
|
|
|
|
|
# ─── HELPERS ─────────────────────────────────────────────
|
|
|
|
def audio_energy(pcm: bytes) -> int:
    """Return the mean absolute amplitude of 16-bit mono PCM.

    Returns 0 for empty or malformed (odd-length) input instead of
    raising, so callers can feed raw mic bytes without pre-validation.
    """
    try:
        samples = array.array("h", pcm)
    except Exception:
        return 0  # not a whole number of 16-bit frames
    if not samples:
        return 0
    total = 0
    for value in samples:
        total += abs(value)
    return total // len(samples)
|
|
|
|
|
|
# ─── TURN RECORDER ──────────────────────────────────────
|
|
|
|
class TurnRecorder:
    """Persist each conversational turn as a pair of WAV files.

    A turn opens when the first user or robot audio chunk is captured
    and closes on `finish_turn`. Files land in the output directory as
    `<timestamp>_user.wav` (16 kHz mic) and `<timestamp>_robot.wav`
    (24 kHz Gemini audio).

    An `index.json` in the same directory accumulates one metadata
    entry per turn (timestamp, transcripts, durations) so the dashboard
    can browse recordings later.
    """

    def __init__(self, enabled: bool = True, out_dir: Path = RECORD_DIR):
        self.enabled = enabled
        self.out_dir = out_dir
        if self.enabled:
            self.out_dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()          # guards all mutable state below
        self._user_buf: list[bytes] = []       # raw 16 kHz user PCM chunks
        self._robot_buf: list[bytes] = []      # raw 24 kHz robot PCM chunks
        self._user_text = ""
        self._robot_text = ""
        self._started_at: float = 0.0          # wall-clock start of current turn

    def _mark_turn_start_locked(self) -> None:
        # Caller must hold self._lock. The very first chunk of either
        # stream stamps the turn's start time.
        if not self._user_buf and not self._robot_buf:
            self._started_at = time.time()

    def capture_user(self, pcm: bytes) -> None:
        """Buffer one chunk of user microphone PCM for the current turn."""
        if not (self.enabled and pcm):
            return
        with self._lock:
            self._mark_turn_start_locked()
            self._user_buf.append(pcm)

    def capture_robot(self, pcm: bytes) -> None:
        """Buffer one chunk of Gemini output PCM for the current turn."""
        if not (self.enabled and pcm):
            return
        with self._lock:
            self._mark_turn_start_locked()
            self._robot_buf.append(pcm)

    def add_user_text(self, text: str) -> None:
        """Accumulate a user transcript fragment for the current turn."""
        if not (text and self.enabled):
            return
        with self._lock:
            self._user_text = (self._user_text + " " + text).strip()

    def add_robot_text(self, text: str) -> None:
        """Accumulate a robot transcript fragment for the current turn."""
        if not (text and self.enabled):
            return
        with self._lock:
            self._robot_text = (self._robot_text + " " + text).strip()

    def finish_turn(self) -> dict:
        """Save current buffers to disk, reset state, return metadata."""
        if not self.enabled:
            return {}

        # Snapshot + reset under the lock; do the slow disk I/O outside it.
        with self._lock:
            user_pcm = b"".join(self._user_buf)
            robot_pcm = b"".join(self._robot_buf)
            user_text, robot_text = self._user_text, self._robot_text
            started_at = self._started_at
            self._user_buf.clear()
            self._robot_buf.clear()
            self._user_text = ""
            self._robot_text = ""

        if not (user_pcm or robot_pcm):
            return {}  # nothing captured — not a real turn

        stamp = datetime.fromtimestamp(started_at).strftime("%Y%m%d_%H%M%S")
        entry = {"timestamp": stamp, "started_at": started_at,
                 "user_text": user_text, "robot_text": robot_text}
        try:
            if user_pcm:
                wav_path = self.out_dir / f"{stamp}_user.wav"
                self._save_wav(wav_path, user_pcm, SEND_RATE)
                entry["user_wav"] = str(wav_path)
                entry["user_duration_sec"] = round(len(user_pcm) / (SEND_RATE * 2), 3)
            if robot_pcm:
                wav_path = self.out_dir / f"{stamp}_robot.wav"
                self._save_wav(wav_path, robot_pcm, RECEIVE_RATE)
                entry["robot_wav"] = str(wav_path)
                entry["robot_duration_sec"] = round(len(robot_pcm) / (RECEIVE_RATE * 2), 3)
            self._append_index(entry)
            log.info("recorded turn → %s (user %.1fs, robot %.1fs)",
                     stamp,
                     entry.get("user_duration_sec", 0),
                     entry.get("robot_duration_sec", 0))
        except Exception as exc:
            # Best-effort: a failed save must never kill the voice loop.
            log.warning("recording save failed: %s", exc)
        return entry

    def _save_wav(self, path: Path, pcm: bytes, rate: int) -> None:
        # Mono 16-bit WAV at the given sample rate.
        with wave.open(str(path), "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(rate)
            wf.writeframes(pcm)

    def _append_index(self, entry: dict) -> None:
        # Read-modify-write of index.json; a missing or corrupt index is
        # replaced with a fresh one rather than raising.
        idx_path = self.out_dir / "index.json"
        payload = {"records": []}
        try:
            if idx_path.exists():
                loaded = json.loads(idx_path.read_text(encoding="utf-8"))
                if isinstance(loaded, dict):
                    payload = loaded
        except Exception:
            payload = {"records": []}
        payload.setdefault("records", []).append(entry)
        payload["total_records"] = len(payload["records"])
        idx_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
                            encoding="utf-8")
|
|
|
|
|
|
# Mic + speaker classes now live in voice/audio_io.py — built via
|
|
# AudioIO.from_profile() in main(). Selected with SANAD_AUDIO_PROFILE
|
|
# (builtin | anker | hollyland_builtin).
|
|
|
|
|
|
# ─── SESSION ─────────────────────────────────────────────
|
|
|
|
async def run_session(mic: Mic, speaker: Speaker, voice: str):
    """Run the Gemini Live voice loop forever, reconnecting between sessions.

    Each pass of the outer `while True` opens one Live session and runs
    two concurrent tasks until either ends:
      * send_mic  — streams amplified mic PCM to Gemini, with barge-in
                    detection and echo suppression while the robot speaks.
      * receive   — plays Gemini audio through `speaker`, logs transcripts,
                    and finalises per-turn recordings.

    Returns only on cancellation or KeyboardInterrupt; any other error
    triggers an exponential-backoff reconnect.
    """
    client = genai.Client(api_key=API_KEY)
    recorder = TurnRecorder(enabled=RECORD_ENABLED)
    if RECORD_ENABLED:
        log.info("recording enabled → %s", RECORD_DIR)

    # Session config: audio-only responses with the chosen prebuilt voice,
    # server-side VAD tuned from config, transcripts for both directions.
    config = types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice
                )
            )
        ),
        realtime_input_config=types.RealtimeInputConfig(
            automatic_activity_detection=types.AutomaticActivityDetection(
                disabled=False,
                # Sensitivity names come from config as enum member strings.
                start_of_speech_sensitivity=getattr(
                    types.StartSensitivity,
                    _cfg_section("voice", "vad").get(
                        "start_sensitivity", "START_SENSITIVITY_HIGH")),
                end_of_speech_sensitivity=getattr(
                    types.EndSensitivity,
                    _cfg_section("voice", "vad").get(
                        "end_sensitivity", "END_SENSITIVITY_LOW")),
                prefix_padding_ms=_cfg_section("voice", "vad").get("prefix_padding_ms", 20),
                silence_duration_ms=_cfg_section("voice", "vad").get("silence_duration_ms", 200),
            )
        ),
        input_audio_transcription=types.AudioTranscriptionConfig(),
        output_audio_transcription=types.AudioTranscriptionConfig(),
        system_instruction=types.Content(
            parts=[types.Part(text=SYSTEM_PROMPT)]
        ),
    )

    session_num = 0
    start_time = time.time()
    consecutive_errors = 0

    while True:
        session_num += 1
        # Per-session state shared across the nested tasks via nonlocal.
        speaking = False            # True while Gemini audio is playing
        stream_started = False      # True once speaker.begin_stream ran
        barge_block_until = 0.0     # ignore barge-in until this time
        ai_speak_start = 0.0        # when the current robot turn started
        last_ai_audio = 0.0         # NOTE(review): written but never read here

        # Barge-in tunables, re-read each session so config edits apply
        # on reconnect without a restart.
        _bi = _cfg_section("voice", "barge_in")
        BARGE_THRESHOLD = _bi.get("threshold", 500)
        LOUD_CHUNKS_NEEDED = _bi.get("loud_chunks_needed", 3)
        BARGE_COOLDOWN = _bi.get("cooldown_sec", 0.3)
        ECHO_SUPPRESS_BELOW = _bi.get("echo_suppress_below", 500)
        AI_SPEAK_GRACE_SEC = _bi.get("ai_speak_grace_sec", 0.15)

        uptime_min = (time.time() - start_time) / 60

        try:
            log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
                     session_num, uptime_min)
            async with client.aio.live.connect(model=MODEL, config=config) as session:
                log.info("connected — speak anytime!")
                consecutive_errors = 0  # reset on successful connect
                mic.flush()
                done = asyncio.Event()  # set by either task to end the session

                # ── Send mic ──
                async def send_mic():
                    """Pump mic chunks to Gemini until `done` is set."""
                    nonlocal speaking, barge_block_until
                    chunk_bytes = CHUNK_SAMPLES * 2  # 16-bit samples
                    loud_count = 0
                    last_activity = time.time()
                    loop = asyncio.get_event_loop()

                    while not done.is_set():
                        # Blocking mic read runs in the default executor so
                        # it doesn't stall the event loop.
                        try:
                            raw = await loop.run_in_executor(
                                None, lambda: mic.read_chunk(chunk_bytes))
                        except Exception:
                            break

                        # Amplify
                        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
                        samples = np.clip(samples * MIC_GAIN, -32768, 32767).astype(np.int16)
                        data = samples.tobytes()
                        energy = audio_energy(data)
                        now = time.time()

                        # Barge-in: while the robot speaks, count loud mic
                        # chunks; enough of them interrupts playback.
                        if speaking and now >= barge_block_until:
                            if (now - ai_speak_start) >= AI_SPEAK_GRACE_SEC:
                                if energy > BARGE_THRESHOLD:
                                    loud_count += 1
                                else:
                                    loud_count = max(0, loud_count - 1)
                                if loud_count > LOUD_CHUNKS_NEEDED:
                                    log.info("BARGE-IN (e=%d)", energy)
                                    do_interrupt("barge-in")
                                    loud_count = 0
                                    barge_block_until = now + BARGE_COOLDOWN

                        # Echo suppression: while speaking, replace quiet mic
                        # input (likely speaker bleed) with silence.
                        send_data = data
                        if speaking and energy < ECHO_SUPPRESS_BELOW:
                            send_data = SILENCE_PCM[:chunk_bytes]

                        # Record user audio (only when clearly speaking,
                        # energy > 250 — skip ambient silence noise)
                        if energy > 250 and not speaking:
                            recorder.capture_user(data)

                        # Watchdog
                        if energy > 250:
                            last_activity = now
                        elif now - last_activity > 10:
                            log.info("alive (no speech %.0fs, e=%d, buf=%d)",
                                     now - last_activity, energy, len(mic._buf))
                            last_activity = now

                        try:
                            await session.send_realtime_input(
                                audio=types.Blob(
                                    data=send_data,
                                    mime_type=f"audio/pcm;rate={SEND_RATE}"
                                )
                            )
                        except asyncio.CancelledError:
                            return
                        except Exception as e:
                            log.warning("mic send failed: %s — ending session", e)
                            done.set()
                            return

                        # Pace sends to real time (one chunk's duration).
                        await asyncio.sleep(CHUNK_SAMPLES / SEND_RATE)

                    log.info("send_mic task ended")

                # ── Interrupt helper ──
                def do_interrupt(source="local"):
                    # Stop playback, drop stale mic data, close out the turn.
                    # `source` is informational only — currently unused.
                    nonlocal speaking, stream_started
                    speaking = False
                    stream_started = False
                    speaker.stop()
                    mic.flush()
                    recorder.finish_turn()

                # ── Receive ──
                async def receive():
                    """Consume server messages: audio, transcripts, control."""
                    nonlocal speaking, stream_started
                    nonlocal ai_speak_start, last_ai_audio
                    loop = asyncio.get_event_loop()

                    try:
                        last_recv = time.time()
                        while not done.is_set():
                            async for response in session.receive():
                                last_recv = time.time()
                                if done.is_set():
                                    break

                                # Server going away — reconnect soon
                                if hasattr(response, 'go_away') and response.go_away is not None:
                                    log.info("server going away — will reconnect")
                                    done.set()
                                    return

                                sc = response.server_content
                                if sc is None:
                                    continue

                                # Gemini interrupted
                                if sc.interrupted is True:
                                    if speaking:
                                        log.info("Gemini interrupted")
                                        do_interrupt("gemini")
                                    continue

                                # User transcript
                                if sc.input_transcription:
                                    text = (sc.input_transcription.text or "").strip()
                                    if text and not speaking:
                                        log.info("USER: %s", text)
                                        recorder.add_user_text(text)

                                # Marcus transcript
                                if sc.output_transcription:
                                    text = (sc.output_transcription.text or "").strip()
                                    if text:
                                        log.info("MARCUS: %s", text)
                                        recorder.add_robot_text(text)

                                # AI audio
                                if sc.model_turn:
                                    for part in sc.model_turn.parts:
                                        if part.inline_data and part.inline_data.data:
                                            now = time.time()
                                            if not speaking:
                                                ai_speak_start = now
                                                speaking = True
                                            last_ai_audio = now
                                            raw_audio = part.inline_data.data
                                            recorder.capture_robot(raw_audio)
                                            audio = np.frombuffer(
                                                raw_audio, dtype=np.int16)
                                            # Speaker calls are blocking —
                                            # run them in the executor.
                                            if not stream_started:
                                                await loop.run_in_executor(
                                                    None, speaker.begin_stream)
                                                stream_started = True
                                            await loop.run_in_executor(
                                                None, speaker.send_chunk,
                                                audio, RECEIVE_RATE)

                                # Turn complete
                                if sc.turn_complete:
                                    if speaking and stream_started and not speaker.interrupted:
                                        dur = speaker.total_sent_sec
                                        log.info("speaker %.1fs", dur)
                                        # Let queued audio drain before
                                        # re-opening the mic path.
                                        await loop.run_in_executor(
                                            None, speaker.wait_finish)
                                    elif speaking and speaker.interrupted:
                                        log.info("speaker interrupted")
                                    speaking = False
                                    stream_started = False
                                    mic.flush()
                                    recorder.finish_turn()
                                    log.info("listening")

                            # receive() iterator ended — check if session is still alive
                            if time.time() - last_recv > 30:
                                log.warning("no messages from Gemini for 30s — session dead")
                                break
                            await asyncio.sleep(0.1)

                    except Exception as e:
                        log.warning("receive ended: %s", e)
                    finally:
                        # Ensure send_mic also exits.
                        done.set()

                try:
                    await asyncio.wait_for(
                        asyncio.gather(send_mic(), receive()),
                        timeout=_SV.get("session_timeout_sec", 660),  # 11 min max (server go_away at ~10 min)
                    )
                except asyncio.TimeoutError:
                    log.warning("session timed out after 11 min")
                except asyncio.CancelledError:
                    log.warning("session cancelled")

                log.info("session #%d ended — reconnecting in 1s", session_num)
                speaker.stop()
                mic.flush()
                await asyncio.sleep(1)

        except asyncio.CancelledError:
            log.info("cancelled — stopping")
            break
        except KeyboardInterrupt:
            log.info("keyboard interrupt — stopping")
            break
        except Exception as e:
            consecutive_errors += 1
            # Exponential backoff: 2s, 4s, 8s, 16s, max 30s
            delay = min(30, 2 ** consecutive_errors)
            log.error("session error (#%d): %s — reconnecting in %ds",
                      consecutive_errors, e, delay)
            await asyncio.sleep(delay)

            # After 10 consecutive errors, restart the client
            if consecutive_errors >= 10:
                log.warning("10 consecutive errors — recreating client")
                try:
                    client = genai.Client(api_key=API_KEY)
                    consecutive_errors = 0
                except Exception as ce:
                    log.error("client recreation failed: %s", ce)
|
|
|
|
|
|
# ─── MAIN ────────────────────────────────────────────────
|
|
|
|
def main():
    """Entry point: parse CLI args, bring up DDS + audio, run the voice loop.

    Usage: sanad_voice.py <iface> [--voice NAME]
    Exits with status 1 when no network interface is given.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)

    iface = sys.argv[1]
    voice = VOICE_NAME
    if "--voice" in sys.argv:
        idx = sys.argv.index("--voice")
        # Guard against a trailing `--voice` with no value (previously
        # raised IndexError); fall back to the configured voice.
        if idx + 1 < len(sys.argv):
            voice = sys.argv[idx + 1]
        else:
            log.warning("--voice given without a value — keeping %s", voice)

    # DDS channel + robot audio client must be up before AudioIO.
    log.info("DDS on %s", iface)
    ChannelFactoryInitialize(0, iface)
    ac = AudioClient()
    ac.SetTimeout(10.0)
    ac.Init()
    log.info("AudioClient ready")

    # Build mic/speaker pair from the selected audio profile.
    profile = os.environ.get("SANAD_AUDIO_PROFILE", "builtin")
    audio = AudioIO.from_profile(profile, audio_client=ac)
    audio.start()
    mic, speaker = audio.mic, audio.speaker
    log.info("audio profile=%s", audio.profile_id)

    # Quick sanity check: after 2s the mic should yield non-silent samples.
    log.info("testing mic 2s...")
    time.sleep(2)
    test = mic.read_chunk(1024)
    e = audio_energy(test)
    log.info("mic energy=%d %s", e, "OK" if e > 0 else "SILENT")

    log.info("voice=%s log=%s", voice, LOG_FILE)
    log.info("─" * 50)

    try:
        asyncio.run(run_session(mic, speaker, voice))
    except KeyboardInterrupt:
        pass
    except Exception as e:
        log.error("fatal: %s", e)
    finally:
        # Always release audio devices, even on fatal errors.
        log.info("stopped")
        audio.stop()
|
|
|
|
|
|
# Script entry point — expects the DDS network interface name as argv[1].
if __name__ == "__main__":
    main()
|