325 lines
12 KiB
Python
325 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""Sanad voice subprocess — orchestrator.
|
|
|
|
Wires three independently-swappable pieces together:
|
|
|
|
1. Audio I/O — voice/audio_io.py (mic + speaker)
|
|
2. Turn recorder — TurnRecorder (in this file; model-agnostic WAV capture)
|
|
3. Voice brain — gemini/script.py (Gemini, default — cloud)
|
|
local/script.py (offline — Whisper+Qwen+CosyVoice2)
|
|
voice/model_script.py (template for new models)
|
|
|
|
Runtime selection:
|
|
SANAD_AUDIO_PROFILE = builtin | anker | hollyland_builtin (default builtin)
|
|
SANAD_VOICE_BRAIN = gemini | local | model (default gemini)
|
|
|
|
Usage:
|
|
python3 voice/sanad_voice.py eth0
|
|
python3 voice/sanad_voice.py eth0 --voice Charon
|
|
SANAD_AUDIO_PROFILE=anker SANAD_VOICE_BRAIN=gemini \\
|
|
python3 voice/sanad_voice.py eth0
|
|
|
|
System prompt priority (first hit wins):
|
|
1. scripts/sanad_script.txt (edit-live via the dashboard)
|
|
2. config/core_config.json > gemini_defaults.default_system_prompt
|
|
3. the hardcoded fallback in _load_system_prompt() below
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import array
|
|
import asyncio
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
import threading
|
|
import time
|
|
import wave
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
from unitree_sdk2py.core.channel import ChannelFactoryInitialize
|
|
from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
|
|
|
|
from Project.Sanad.config import (
|
|
GEMINI_VOICE,
|
|
RECEIVE_SAMPLE_RATE,
|
|
SCRIPTS_DIR,
|
|
SEND_SAMPLE_RATE,
|
|
)
|
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
|
from Project.Sanad.voice.audio_io import AudioIO
|
|
|
|
# ─── LOGGING ─────────────────────────────────────────────
|
|
|
|
# Log directory and file-name stem are read from the ("voice", "sanad_voice")
# config section, with "~/logs" and "gemini_live_v2" as defaults.
_LOG_CFG = _cfg_section("voice", "sanad_voice")
LOG_DIR = os.path.expanduser(_LOG_CFG.get("log_dir", "~/logs"))
os.makedirs(LOG_DIR, exist_ok=True)
_LOG_NAME = _LOG_CFG.get("log_name", "gemini_live_v2")
# One log file per calendar day (the date is fixed at import time).
LOG_FILE = os.path.join(LOG_DIR, f"{_LOG_NAME}_{datetime.now():%Y%m%d}.log")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    handlers=[
        # Messages in this file contain non-ASCII characters ("─", "→"), so
        # pin the file encoding instead of inheriting the process locale.
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("sanad_voice")
|
|
|
|
|
|
# ─── CONFIG ──────────────────────────────────────────────
|
|
|
|
# Config sections consulted by this module.
_REC = _cfg_section("voice", "recording")
_SCRIPTS = _cfg_section("core", "script_files")
_GEMINI_DEFAULTS = _cfg_section("core", "gemini_defaults")

# Persona script that, when present and non-empty, supplies the system prompt.
_PERSONA_FILE = SCRIPTS_DIR / _SCRIPTS.get("persona", "sanad_script.txt")

# Recording is on unless SANAD_RECORD is exactly "0"; when the variable is
# unset, the config's `enabled` flag (default True) decides.
RECORD_ENABLED = (
    os.environ.get("SANAD_RECORD", "1" if _REC.get("enabled", True) else "0")
    != "0"
)

# Recording directory: SANAD_RECORD_DIR wins; otherwise the configured
# relative path is resolved against the directory two levels above this file.
_REC_DIR_REL = _REC.get("dir_relative", "data/recordings")
RECORD_DIR = Path(
    os.environ.get(
        "SANAD_RECORD_DIR",
        str(Path(__file__).resolve().parents[1] / _REC_DIR_REL),
    )
)
|
|
|
|
# Last-resort system prompt, used by _load_system_prompt() when neither the
# persona file nor the config provides one. This text is sent to the model
# verbatim, so edit it with care.
_FALLBACK_SYSTEM_PROMPT = (
    "You are Marcus, a bilingual humanoid robot assistant made by YS Lootah "
    "Technology, Dubai, UAE. RESPOND IN ARABIC (Gulf/Emirati dialect) OR "
    "ENGLISH ONLY. YOU MUST RESPOND UNMISTAKABLY IN THE SAME LANGUAGE THE "
    "USER SPEAKS. If the user speaks Arabic, you MUST reply in Arabic Gulf "
    "dialect. If the user speaks English, you MUST reply in English. Do NOT "
    "confuse Arabic with Japanese, Hindi, Russian, or any other language. "
    "The user is speaking Arabic or English — nothing else. Be concise — 1 "
    "to 2 sentences max. Be friendly and natural. If the user interrupts "
    "and says 'continue' or 'كمل', resume EXACTLY where you stopped. Only "
    "respond to clear human speech. Ignore background noise and silence "
    "completely. Do not respond to sounds that are not words."
)
|
|
|
|
|
|
def _load_system_prompt() -> str:
    """Resolve the system prompt: persona file → config default → fallback.

    Returns:
        The stripped contents of ``_PERSONA_FILE`` when it can be read and is
        not blank; otherwise the ``default_system_prompt`` config value when
        it is a non-blank string; otherwise ``_FALLBACK_SYSTEM_PROMPT``.

    A missing persona file is expected and skipped silently. Any other read
    or decode failure is logged and treated like a missing file, so a broken
    persona script cannot prevent the voice process from starting.
    """
    try:
        # "utf-8-sig" drops a leading BOM if the file was saved with one.
        text = _PERSONA_FILE.read_text(encoding="utf-8-sig").strip()
    except FileNotFoundError:
        text = ""
    except (OSError, UnicodeDecodeError) as exc:
        log.warning("persona file %s unreadable, using default prompt: %s",
                    _PERSONA_FILE, exc)
        text = ""
    if text:
        return text

    # A blank or non-string config entry must not shadow the hardcoded prompt.
    configured = _GEMINI_DEFAULTS.get("default_system_prompt")
    if isinstance(configured, str) and configured.strip():
        return configured
    return _FALLBACK_SYSTEM_PROMPT
|
|
|
|
|
|
def _audio_energy(pcm: bytes) -> int:
    """Return the mean absolute amplitude of a 16-bit PCM buffer.

    Args:
        pcm: Raw signed 16-bit samples in native byte order.

    Returns:
        ``sum(|sample|) // sample_count``, or 0 for empty or unusable input.

    A trailing odd byte (an incomplete sample) is ignored rather than making
    the whole buffer count as silent.
    """
    if not pcm:
        return 0
    try:
        # array("h", ...) rejects byte strings that are not a multiple of the
        # 2-byte item size, so drop a dangling half-sample first.
        whole = len(pcm) - (len(pcm) % 2)
        samples = array.array("h", pcm[:whole])
    except (TypeError, ValueError):
        return 0
    if not samples:
        return 0
    return sum(map(abs, samples)) // len(samples)
|
|
|
|
|
|
# ─── TURN RECORDER ──────────────────────────────────────
|
|
|
|
class TurnRecorder:
    """Saves each turn as two WAV files: user mic + model output.

    A turn starts when the first audio chunk arrives through `capture_user`
    or `capture_robot` and ends on `finish_turn`. Files land in `out_dir`
    as `<timestamp>_user.wav` (at `user_rate`) and `<timestamp>_robot.wav`
    (at `robot_rate`). An `index.json` in the same directory tracks
    every turn with timestamp + transcripts + durations for the dashboard.

    Audio is written as mono, 16-bit PCM. Buffers and transcripts are
    guarded by a lock, so the capture/add methods may be called from
    different threads.
    """

    def __init__(self, enabled: bool = True, out_dir: Path = RECORD_DIR,
                 user_rate: int = SEND_SAMPLE_RATE,
                 robot_rate: int = RECEIVE_SAMPLE_RATE):
        """Create a recorder.

        Args:
            enabled: When False every method is a no-op and nothing is
                written to disk.
            out_dir: Directory for WAV files and `index.json`; created if
                missing when recording is enabled. A plain string is accepted.
            user_rate: Sample rate (Hz) written into the user WAV header.
            robot_rate: Sample rate (Hz) written into the robot WAV header.
        """
        self.enabled = enabled
        self.out_dir = Path(out_dir)
        self.user_rate = user_rate
        self.robot_rate = robot_rate
        if self.enabled:
            self.out_dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
        self._user_buf: list[bytes] = []
        self._robot_buf: list[bytes] = []
        self._user_text = ""
        self._robot_text = ""
        self._started_at: float = 0.0

    def capture_user(self, pcm: bytes) -> None:
        """Append a chunk of user (mic) PCM to the current turn."""
        if not self.enabled or not pcm:
            return
        with self._lock:
            # The first audio chunk of either side marks the start of a turn.
            if not self._user_buf and not self._robot_buf:
                self._started_at = time.time()
            self._user_buf.append(pcm)

    def capture_robot(self, pcm: bytes) -> None:
        """Append a chunk of model-output PCM to the current turn."""
        if not self.enabled or not pcm:
            return
        with self._lock:
            # The first audio chunk of either side marks the start of a turn.
            if not self._user_buf and not self._robot_buf:
                self._started_at = time.time()
            self._robot_buf.append(pcm)

    def add_user_text(self, text: str) -> None:
        """Append a transcript fragment for the user side (space-joined)."""
        if text and self.enabled:
            with self._lock:
                self._user_text = (self._user_text + " " + text).strip()

    def add_robot_text(self, text: str) -> None:
        """Append a transcript fragment for the robot side (space-joined)."""
        if text and self.enabled:
            with self._lock:
                self._robot_text = (self._robot_text + " " + text).strip()

    def finish_turn(self) -> dict:
        """Flush the current turn to disk and reset the buffers.

        Returns:
            `{}` when recording is disabled or no audio was captured
            (any transcript collected for such a turn is discarded).
            Otherwise the index entry for the turn. If saving fails the
            error is logged and the entry is still returned, containing
            whatever fields were filled in before the failure.
        """
        if not self.enabled:
            return {}

        # Snapshot and reset under the lock; disk I/O happens outside it so
        # capture threads are not blocked while files are written.
        with self._lock:
            user_data = b"".join(self._user_buf)
            robot_data = b"".join(self._robot_buf)
            user_text = self._user_text
            robot_text = self._robot_text
            started_at = self._started_at
            self._user_buf.clear()
            self._robot_buf.clear()
            self._user_text = ""
            self._robot_text = ""

        if not user_data and not robot_data:
            return {}

        stamp = datetime.fromtimestamp(started_at).strftime("%Y%m%d_%H%M%S")
        entry = {"timestamp": stamp, "started_at": started_at,
                 "user_text": user_text, "robot_text": robot_text}
        try:
            if user_data:
                p = self.out_dir / f"{stamp}_user.wav"
                self._save_wav(p, user_data, self.user_rate)
                entry["user_wav"] = str(p)
                # 2 bytes per sample, mono → bytes / (rate * 2) seconds.
                entry["user_duration_sec"] = round(
                    len(user_data) / (self.user_rate * 2), 3)
            if robot_data:
                p = self.out_dir / f"{stamp}_robot.wav"
                self._save_wav(p, robot_data, self.robot_rate)
                entry["robot_wav"] = str(p)
                entry["robot_duration_sec"] = round(
                    len(robot_data) / (self.robot_rate * 2), 3)
            self._append_index(entry)
            log.info("recorded turn → %s (user %.1fs, robot %.1fs)",
                     stamp,
                     entry.get("user_duration_sec", 0),
                     entry.get("robot_duration_sec", 0))
        except Exception as exc:
            # Recording is best-effort: never let a disk error kill the turn.
            log.warning("recording save failed: %s", exc)
        return entry

    @staticmethod
    def _save_wav(path: Path, pcm: bytes, rate: int) -> None:
        """Write `pcm` to `path` as a mono, 16-bit WAV at `rate` Hz."""
        with wave.open(str(path), "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(rate)
            wf.writeframes(pcm)

    def _append_index(self, entry: dict) -> None:
        """Append `entry` to `index.json` and refresh `total_records`.

        A missing index starts empty. An unreadable or malformed index is
        logged and replaced by a fresh one. The new content is written to a
        temporary sibling file and moved into place with `os.replace`, so a
        reader (or a crash mid-write) never observes a truncated index.
        """
        idx_path = self.out_dir / "index.json"
        payload: dict = {"records": []}
        try:
            loaded = json.loads(idx_path.read_text(encoding="utf-8"))
            if isinstance(loaded, dict):
                payload = loaded
        except FileNotFoundError:
            pass
        except (OSError, ValueError) as exc:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            log.warning("index.json unreadable, starting a new index: %s", exc)

        records = payload.get("records")
        if not isinstance(records, list):
            # Tolerate a hand-edited/corrupted "records" value.
            records = []
            payload["records"] = records
        records.append(entry)
        payload["total_records"] = len(records)

        tmp_path = idx_path.with_name(idx_path.name + ".tmp")
        tmp_path.write_text(
            json.dumps(payload, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        os.replace(tmp_path, idx_path)
|
|
|
|
|
|
# ─── BRAIN FACTORY ───────────────────────────────────────
|
|
|
|
def _build_brain(name: str, audio_io, recorder, voice: str, system_prompt: str):
    """Instantiate the voice brain selected by *name*.

    The name is matched case-insensitively after trimming whitespace; an
    empty or missing name selects Gemini. Each provider module is imported
    only when it is chosen.

    Raises:
        ValueError: if the normalized name is not a known brain.
    """
    key = (name or "").strip().lower()

    # To add a provider: import the module and bind its brain class here.
    if key == "local":
        from Project.Sanad.local.script import LocalBrain as brain_cls
    elif key == "model":
        from Project.Sanad.voice.model_script import ModelBrain as brain_cls
    elif key in ("", "gemini"):
        from Project.Sanad.gemini.script import GeminiBrain as brain_cls
    else:
        raise ValueError(f"unknown voice brain: {key!r}")

    return brain_cls(audio_io, recorder, voice, system_prompt)
|
|
|
|
|
|
# ─── MAIN ────────────────────────────────────────────────
|
|
|
|
def main() -> None:
    """Run the voice subprocess until the brain exits or is interrupted.

    Command line: ``<iface> [--voice NAME]`` (the option may appear before or
    after the interface). With no interface the module docstring is printed
    and the process exits with status 1; a ``--voice`` flag without a value
    is reported on stderr and also exits with status 1.

    Steps: initialise DDS and the Unitree audio client, start the audio
    profile named by ``SANAD_AUDIO_PROFILE``, log a quick mic energy check,
    build the brain named by ``SANAD_VOICE_BRAIN`` and run it. Once audio has
    been started it is always stopped again, even if a later step raises.
    """
    args = sys.argv[1:]
    if not args:
        print(__doc__)
        sys.exit(1)

    voice = GEMINI_VOICE
    if "--voice" in args:
        pos = args.index("--voice")
        if pos + 1 >= len(args):
            print("error: --voice requires a value", file=sys.stderr)
            sys.exit(1)
        voice = args[pos + 1]
        # Remove the option pair so it can never be mistaken for the interface.
        del args[pos:pos + 2]
    if not args:
        print(__doc__)
        sys.exit(1)
    iface = args[0]

    log.info("DDS on %s", iface)
    ChannelFactoryInitialize(0, iface)
    ac = AudioClient()
    ac.SetTimeout(10.0)
    ac.Init()
    log.info("AudioClient ready")

    profile = os.environ.get("SANAD_AUDIO_PROFILE", "builtin")
    audio = AudioIO.from_profile(profile, audio_client=ac)
    audio.start()

    brain = None
    try:
        log.info("audio profile=%s", audio.profile_id)

        # Sanity-check the mic before handing it to the brain
        log.info("testing mic 2s...")
        time.sleep(2)
        test = audio.mic.read_chunk(1024)
        energy = _audio_energy(test)
        log.info("mic energy=%d %s", energy, "OK" if energy > 0 else "SILENT")

        recorder = TurnRecorder(enabled=RECORD_ENABLED)
        if RECORD_ENABLED:
            log.info("recording enabled → %s", RECORD_DIR)

        system_prompt = _load_system_prompt()
        brain_name = os.environ.get("SANAD_VOICE_BRAIN", "gemini")
        brain = _build_brain(brain_name, audio, recorder, voice, system_prompt)
        log.info("voice brain=%s voice=%s log=%s", brain_name, voice, LOG_FILE)
        log.info("─" * 50)

        try:
            asyncio.run(brain.run())
        except KeyboardInterrupt:
            pass
        except Exception as exc:
            # Keep the traceback: this is the only record of why the brain died.
            log.error("fatal: %s", exc, exc_info=True)
    finally:
        log.info("stopping")
        if brain is not None:
            try:
                brain.stop()
            except Exception:
                log.warning("brain.stop() failed", exc_info=True)
        audio.stop()
        log.info("stopped")
|
|
|
|
|
|
# Run the orchestrator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|