#!/usr/bin/env python3 """Sanad voice subprocess — orchestrator. Wires three independently-swappable pieces together: 1. Audio I/O — voice/audio_io.py (mic + speaker) 2. Turn recorder — TurnRecorder (in this file; model-agnostic WAV capture) 3. Voice brain — gemini/script.py (Gemini, default — cloud) local/script.py (offline — Whisper+Qwen+CosyVoice2) voice/model_script.py (template for new models) Runtime selection: SANAD_AUDIO_PROFILE = builtin | anker | hollyland_builtin (default builtin) SANAD_VOICE_BRAIN = gemini | local | model (default gemini) Usage: python3 voice/sanad_voice.py eth0 python3 voice/sanad_voice.py eth0 --voice Charon SANAD_AUDIO_PROFILE=anker SANAD_VOICE_BRAIN=gemini \\ python3 voice/sanad_voice.py eth0 System prompt priority (first hit wins): 1. scripts/sanad_script.txt (edit-live via the dashboard) 2. config/core_config.json > gemini_defaults.default_system_prompt 3. the hardcoded fallback in _load_system_prompt() below """ from __future__ import annotations import array import asyncio import importlib import json import logging import os import sys import threading import time import types import wave from datetime import datetime from pathlib import Path # ───────────────────────────────────────────────────────────────────────────── # Layout bootstrap — MUST run before any `Project.Sanad.*` import. # This file runs as a standalone subprocess (spawned by gemini/subprocess.py # or local/subprocess.py); it can't rely on main.py having set up sys.path. # Mirrors the dev-vs-deployed detection in main.py. # dev layout: /Project/Sanad/voice/sanad_voice.py # deployed layout: /home/unitree/Sanad/voice/sanad_voice.py # ───────────────────────────────────────────────────────────────────────────── _SANAD_DIR = Path(__file__).resolve().parent.parent # .../Sanad _SANAD_PARENT = _SANAD_DIR.parent # .../Project OR /home/unitree if _SANAD_PARENT.name == "Project": _ROOT = _SANAD_PARENT.parent if str(_ROOT) not in sys.path: sys.path.insert(0, str(_ROOT)) else: if str(_SANAD_PARENT) not in sys.path: sys.path.insert(0, str(_SANAD_PARENT)) if "Project" not in sys.modules: _proj = types.ModuleType("Project") _proj.__path__ = [] # namespace package marker sys.modules["Project"] = _proj if "Project.Sanad" not in sys.modules: _sanad = importlib.import_module(_SANAD_DIR.name) sys.modules["Project.Sanad"] = _sanad sys.modules["Project"].Sanad = _sanad # type: ignore[attr-defined] from unitree_sdk2py.core.channel import ChannelFactoryInitialize from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient from Project.Sanad.config import ( GEMINI_VOICE, RECEIVE_SAMPLE_RATE, SCRIPTS_DIR, SEND_SAMPLE_RATE, ) from Project.Sanad.core.config_loader import section as _cfg_section from Project.Sanad.voice.audio_io import AudioIO # ─── LOGGING ───────────────────────────────────────────── _LOG_CFG = _cfg_section("voice", "sanad_voice") LOG_DIR = os.path.expanduser(_LOG_CFG.get("log_dir", "~/logs")) os.makedirs(LOG_DIR, exist_ok=True) _LOG_NAME = _LOG_CFG.get("log_name", "gemini_live_v2") LOG_FILE = os.path.join(LOG_DIR, f"{_LOG_NAME}_{datetime.now():%Y%m%d}.log") logging.basicConfig( level=logging.INFO, format="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)s", datefmt="%H:%M:%S", handlers=[ logging.FileHandler(LOG_FILE), logging.StreamHandler(), ], ) log = logging.getLogger("sanad_voice") # ─── CONFIG ────────────────────────────────────────────── _REC = _cfg_section("voice", "recording") _SCRIPTS = _cfg_section("core", "script_files") _GEMINI_DEFAULTS = _cfg_section("core", "gemini_defaults") _PERSONA_FILE = SCRIPTS_DIR / _SCRIPTS.get("persona", "sanad_script.txt") RECORD_ENABLED = os.environ.get( "SANAD_RECORD", "1" if _REC.get("enabled", True) else "0", ) != "0" _REC_DIR_REL = _REC.get("dir_relative", "data/recordings") RECORD_DIR = Path(os.environ.get( "SANAD_RECORD_DIR", str(Path(__file__).resolve().parent.parent / _REC_DIR_REL), )) _FALLBACK_SYSTEM_PROMPT = ( "You are Marcus, a bilingual humanoid robot assistant made by YS Lootah " "Technology, Dubai, UAE. RESPOND IN ARABIC (Gulf/Emirati dialect) OR " "ENGLISH ONLY. YOU MUST RESPOND UNMISTAKABLY IN THE SAME LANGUAGE THE " "USER SPEAKS. If the user speaks Arabic, you MUST reply in Arabic Gulf " "dialect. If the user speaks English, you MUST reply in English. Do NOT " "confuse Arabic with Japanese, Hindi, Russian, or any other language. " "The user is speaking Arabic or English — nothing else. Be concise — 1 " "to 2 sentences max. Be friendly and natural. If the user interrupts " "and says 'continue' or 'كمل', resume EXACTLY where you stopped. Only " "respond to clear human speech. Ignore background noise and silence " "completely. Do not respond to sounds that are not words." ) def _load_system_prompt() -> str: """scripts/sanad_script.txt → config default → hardcoded fallback.""" try: text = _PERSONA_FILE.read_text(encoding="utf-8-sig").strip() if text: return text except FileNotFoundError: pass return _GEMINI_DEFAULTS.get("default_system_prompt", _FALLBACK_SYSTEM_PROMPT) def _audio_energy(pcm: bytes) -> int: try: samples = array.array("h", pcm) return sum(abs(s) for s in samples) // len(samples) if samples else 0 except Exception: return 0 # ─── TURN RECORDER ────────────────────────────────────── class TurnRecorder: """Saves each turn as two WAV files: user mic + model output. A turn starts when user audio starts flowing through `capture_user` and ends on `finish_turn`. Files land in `RECORD_DIR` as `_user.wav` (at `user_rate`) and `_robot.wav` (at `robot_rate`). An `index.json` in the same directory tracks every turn with timestamp + transcripts + durations for the dashboard. """ def __init__(self, enabled: bool = True, out_dir: Path = RECORD_DIR, user_rate: int = SEND_SAMPLE_RATE, robot_rate: int = RECEIVE_SAMPLE_RATE): self.enabled = enabled self.out_dir = out_dir self.user_rate = user_rate self.robot_rate = robot_rate if self.enabled: self.out_dir.mkdir(parents=True, exist_ok=True) self._lock = threading.Lock() self._user_buf: list[bytes] = [] self._robot_buf: list[bytes] = [] self._user_text = "" self._robot_text = "" self._started_at: float = 0.0 def capture_user(self, pcm: bytes) -> None: if not self.enabled or not pcm: return with self._lock: if not self._user_buf and not self._robot_buf: self._started_at = time.time() self._user_buf.append(pcm) def capture_robot(self, pcm: bytes) -> None: if not self.enabled or not pcm: return with self._lock: if not self._user_buf and not self._robot_buf: self._started_at = time.time() self._robot_buf.append(pcm) def add_user_text(self, text: str) -> None: if text and self.enabled: with self._lock: self._user_text = (self._user_text + " " + text).strip() def add_robot_text(self, text: str) -> None: if text and self.enabled: with self._lock: self._robot_text = (self._robot_text + " " + text).strip() def finish_turn(self) -> dict: if not self.enabled: return {} with self._lock: user_data = b"".join(self._user_buf) robot_data = b"".join(self._robot_buf) user_text = self._user_text robot_text = self._robot_text started_at = self._started_at self._user_buf.clear() self._robot_buf.clear() self._user_text = "" self._robot_text = "" if not user_data and not robot_data: return {} stamp = datetime.fromtimestamp(started_at).strftime("%Y%m%d_%H%M%S") entry = {"timestamp": stamp, "started_at": started_at, "user_text": user_text, "robot_text": robot_text} try: if user_data: p = self.out_dir / f"{stamp}_user.wav" self._save_wav(p, user_data, self.user_rate) entry["user_wav"] = str(p) entry["user_duration_sec"] = round( len(user_data) / (self.user_rate * 2), 3) if robot_data: p = self.out_dir / f"{stamp}_robot.wav" self._save_wav(p, robot_data, self.robot_rate) entry["robot_wav"] = str(p) entry["robot_duration_sec"] = round( len(robot_data) / (self.robot_rate * 2), 3) self._append_index(entry) log.info("recorded turn → %s (user %.1fs, robot %.1fs)", stamp, entry.get("user_duration_sec", 0), entry.get("robot_duration_sec", 0)) except Exception as exc: log.warning("recording save failed: %s", exc) return entry @staticmethod def _save_wav(path: Path, pcm: bytes, rate: int) -> None: with wave.open(str(path), "wb") as wf: wf.setnchannels(1) wf.setsampwidth(2) wf.setframerate(rate) wf.writeframes(pcm) def _append_index(self, entry: dict) -> None: idx_path = self.out_dir / "index.json" try: if idx_path.exists(): payload = json.loads(idx_path.read_text(encoding="utf-8")) if not isinstance(payload, dict): payload = {"records": []} else: payload = {"records": []} except Exception: payload = {"records": []} payload.setdefault("records", []).append(entry) payload["total_records"] = len(payload["records"]) idx_path.write_text( json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8", ) # ─── BRAIN FACTORY ─────────────────────────────────────── def _build_brain(name: str, audio_io, recorder, voice: str, system_prompt: str): name = (name or "").strip().lower() if name in ("", "gemini"): from Project.Sanad.gemini.script import GeminiBrain return GeminiBrain(audio_io, recorder, voice, system_prompt) if name == "local": from Project.Sanad.local.script import LocalBrain return LocalBrain(audio_io, recorder, voice, system_prompt) if name == "model": from Project.Sanad.voice.model_script import ModelBrain return ModelBrain(audio_io, recorder, voice, system_prompt) # To add a provider: import the module and return its brain class here. raise ValueError(f"unknown voice brain: {name!r}") # ─── MAIN ──────────────────────────────────────────────── def main() -> None: if len(sys.argv) < 2: print(__doc__) sys.exit(1) iface = sys.argv[1] voice = GEMINI_VOICE if "--voice" in sys.argv: voice = sys.argv[sys.argv.index("--voice") + 1] log.info("DDS on %s", iface) ChannelFactoryInitialize(0, iface) ac = AudioClient() ac.SetTimeout(10.0) ac.Init() log.info("AudioClient ready") profile = os.environ.get("SANAD_AUDIO_PROFILE", "builtin") audio = AudioIO.from_profile(profile, audio_client=ac) audio.start() log.info("audio profile=%s", audio.profile_id) # Sanity-check the mic before handing it to the brain log.info("testing mic 2s...") time.sleep(2) test = audio.mic.read_chunk(1024) e = _audio_energy(test) log.info("mic energy=%d %s", e, "OK" if e > 0 else "SILENT") recorder = TurnRecorder(enabled=RECORD_ENABLED) if RECORD_ENABLED: log.info("recording enabled → %s", RECORD_DIR) system_prompt = _load_system_prompt() brain_name = os.environ.get("SANAD_VOICE_BRAIN", "gemini") brain = _build_brain(brain_name, audio, recorder, voice, system_prompt) log.info("voice brain=%s voice=%s log=%s", brain_name, voice, LOG_FILE) log.info("─" * 50) try: asyncio.run(brain.run()) except KeyboardInterrupt: pass except Exception as exc: log.error("fatal: %s", exc) finally: log.info("stopping") try: brain.stop() except Exception: log.warning("brain.stop() failed", exc_info=True) audio.stop() log.info("stopped") if __name__ == "__main__": main()