Sanad/voice/sanad_voice.py

#!/usr/bin/env python3
"""Sanad voice subprocess — orchestrator.

Wires three independently-swappable pieces together:

  1. Audio I/O     — voice/audio_io.py           (mic + speaker)
  2. Turn recorder — TurnRecorder (in this file; model-agnostic WAV capture)
  3. Voice brain   — gemini/script.py            (Gemini, default — cloud)
                     local/script.py             (offline — Whisper+Qwen+CosyVoice2)
                     voice/model_script.py       (template for new models)

Runtime selection:
  SANAD_AUDIO_PROFILE  = builtin | anker | hollyland_builtin   (default builtin)
  SANAD_VOICE_BRAIN    = gemini  | local | model               (default gemini)

Usage:
    python3 voice/sanad_voice.py eth0
    python3 voice/sanad_voice.py eth0 --voice Charon
    SANAD_AUDIO_PROFILE=anker SANAD_VOICE_BRAIN=gemini \\
        python3 voice/sanad_voice.py eth0

System prompt priority (first hit wins):
  1. scripts/sanad_script.txt  (edit-live via the dashboard)
  2. config/core_config.json > gemini_defaults.default_system_prompt
  3. the hardcoded _FALLBACK_SYSTEM_PROMPT constant below
"""

from __future__ import annotations

import array
import asyncio
import json
import logging
import os
import sys
import threading
import time
import wave
from datetime import datetime
from pathlib import Path

from unitree_sdk2py.core.channel import ChannelFactoryInitialize
from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient

from Project.Sanad.config import (
    GEMINI_VOICE,
    RECEIVE_SAMPLE_RATE,
    SCRIPTS_DIR,
    SEND_SAMPLE_RATE,
)
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.voice.audio_io import AudioIO

# ─── LOGGING ─────────────────────────────────────────────

# Logging is configured at import time: every record of this subprocess goes
# both to a per-day file under LOG_DIR and to the console stream.
_LOG_CFG = _cfg_section("voice", "sanad_voice")
LOG_DIR = os.path.expanduser(_LOG_CFG.get("log_dir", "~/logs"))
os.makedirs(LOG_DIR, exist_ok=True)
_LOG_NAME = _LOG_CFG.get("log_name", "gemini_live_v2")
# The date suffix is taken once, at import, from the local clock.
LOG_FILE = os.path.join(LOG_DIR, f"{_LOG_NAME}_{datetime.now():%Y%m%d}.log")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    handlers=[
        # Explicit UTF-8: messages emitted by this module contain non-ASCII
        # characters ("→", "─"), so the file must not depend on whatever
        # default encoding the launching environment's locale implies.
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("sanad_voice")


# ─── CONFIG ──────────────────────────────────────────────

# Config sections consumed by this module.
_REC = _cfg_section("voice", "recording")
_SCRIPTS = _cfg_section("core", "script_files")
_GEMINI_DEFAULTS = _cfg_section("core", "gemini_defaults")

# Persona file read by _load_system_prompt(); name comes from config.
_PERSONA_FILE = SCRIPTS_DIR / _SCRIPTS.get("persona", "sanad_script.txt")

# SANAD_RECORD wins when set (anything other than "0" enables recording);
# otherwise the config's `enabled` flag decides.
if "SANAD_RECORD" in os.environ:
    RECORD_ENABLED = os.environ["SANAD_RECORD"] != "0"
else:
    RECORD_ENABLED = bool(_REC.get("enabled", True))

# SANAD_RECORD_DIR wins when set; otherwise the configured relative path is
# anchored two directory levels above this file.
_REC_DIR_REL = _REC.get("dir_relative", "data/recordings")
if "SANAD_RECORD_DIR" in os.environ:
    RECORD_DIR = Path(os.environ["SANAD_RECORD_DIR"])
else:
    RECORD_DIR = Path(__file__).resolve().parents[1] / _REC_DIR_REL

# Last-resort system prompt: _load_system_prompt() uses it as the default when
# the persona file yields nothing and the config has no
# "default_system_prompt" entry. This text is sent to the model at runtime —
# treat every character as behavior, not as documentation.
_FALLBACK_SYSTEM_PROMPT = (
    "You are Marcus, a bilingual humanoid robot assistant made by YS Lootah "
    "Technology, Dubai, UAE. RESPOND IN ARABIC (Gulf/Emirati dialect) OR "
    "ENGLISH ONLY. YOU MUST RESPOND UNMISTAKABLY IN THE SAME LANGUAGE THE "
    "USER SPEAKS. If the user speaks Arabic, you MUST reply in Arabic Gulf "
    "dialect. If the user speaks English, you MUST reply in English. Do NOT "
    "confuse Arabic with Japanese, Hindi, Russian, or any other language. "
    "The user is speaking Arabic or English — nothing else. Be concise — 1 "
    "to 2 sentences max. Be friendly and natural. If the user interrupts "
    "and says 'continue' or 'كمل', resume EXACTLY where you stopped. Only "
    "respond to clear human speech. Ignore background noise and silence "
    "completely. Do not respond to sounds that are not words."
)


def _load_system_prompt() -> str:
    """Resolve the system prompt: persona file → config default → fallback.

    Priority (first non-empty hit wins):
      1. the stripped contents of ``_PERSONA_FILE`` (a UTF-8 BOM is tolerated);
      2. ``_GEMINI_DEFAULTS["default_system_prompt"]`` when it is a non-blank
         string;
      3. ``_FALLBACK_SYSTEM_PROMPT``.

    A persona file that exists but cannot be read or decoded is logged and
    skipped rather than aborting startup.
    """
    try:
        text = _PERSONA_FILE.read_text(encoding="utf-8-sig").strip()
    except FileNotFoundError:
        # Normal case when no persona has been saved yet.
        text = ""
    except (OSError, UnicodeDecodeError) as exc:
        log.warning("persona file %s unreadable (%s); using default prompt",
                    _PERSONA_FILE, exc)
        text = ""
    if text:
        return text

    # An empty / null / non-string config value must not shadow the fallback.
    configured = _GEMINI_DEFAULTS.get("default_system_prompt")
    if isinstance(configured, str) and configured.strip():
        return configured
    return _FALLBACK_SYSTEM_PROMPT


def _audio_energy(pcm: bytes) -> int:
    """Return the mean absolute sample value of a 16-bit PCM buffer.

    The buffer is interpreted as native-endian signed 16-bit samples. A
    trailing odd byte is ignored, so a chunk of odd length still yields the
    energy of its complete samples instead of being reported as silence.

    Returns 0 for empty, ``None`` or otherwise unusable input; never raises.
    """
    try:
        usable = len(pcm) & ~1  # largest even byte count
        samples = array.array("h", bytes(pcm[:usable]))
    except (TypeError, ValueError):
        return 0
    if not samples:
        return 0
    return sum(map(abs, samples)) // len(samples)


# ─── TURN RECORDER ──────────────────────────────────────

class TurnRecorder:
    """Buffer one conversational turn in memory and save it as WAV files.

    Audio chunks arrive through `capture_user` / `capture_robot`; the turn
    starts with the first chunk buffered after the previous `finish_turn`
    and ends on the next `finish_turn`. Files land in `out_dir` as
    `<timestamp>_user.wav` (at `user_rate`) and `<timestamp>_robot.wav`
    (at `robot_rate`), both 16-bit mono. If a file with that name already
    exists (two turns started within the same second) a `_2`, `_3`, …
    suffix is added to the file stem so nothing is overwritten. An
    `index.json` in the same directory accumulates one record per turn
    (timestamp, transcripts, file paths, durations).

    Buffers and transcripts are guarded by a lock; when `enabled` is False
    every method is a no-op.
    """

    def __init__(self, enabled: bool = True, out_dir: Path = RECORD_DIR,
                 user_rate: int = SEND_SAMPLE_RATE,
                 robot_rate: int = RECEIVE_SAMPLE_RATE):
        self.enabled = enabled
        # Accept str as well as Path so callers need not pre-wrap the value.
        self.out_dir = Path(out_dir)
        self.user_rate = user_rate
        self.robot_rate = robot_rate
        if self.enabled:
            self.out_dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
        self._user_buf: list[bytes] = []
        self._robot_buf: list[bytes] = []
        self._user_text = ""
        self._robot_text = ""
        self._started_at: float = 0.0

    def _capture(self, buf: list[bytes], pcm: bytes) -> None:
        """Append `pcm` to `buf`, stamping the turn start on the first chunk."""
        if not self.enabled or not pcm:
            return
        with self._lock:
            if not self._user_buf and not self._robot_buf:
                self._started_at = time.time()
            buf.append(pcm)

    def capture_user(self, pcm: bytes) -> None:
        """Buffer a chunk of user (microphone) PCM for the current turn."""
        self._capture(self._user_buf, pcm)

    def capture_robot(self, pcm: bytes) -> None:
        """Buffer a chunk of model-output PCM for the current turn."""
        self._capture(self._robot_buf, pcm)

    def add_user_text(self, text: str) -> None:
        """Append a user transcript fragment (space-joined) to the turn."""
        if text and self.enabled:
            with self._lock:
                self._user_text = (self._user_text + " " + text).strip()

    def add_robot_text(self, text: str) -> None:
        """Append a model transcript fragment (space-joined) to the turn."""
        if text and self.enabled:
            with self._lock:
                self._robot_text = (self._robot_text + " " + text).strip()

    def finish_turn(self) -> dict:
        """Flush the buffered turn to disk and return its index record.

        Returns `{}` when recording is disabled or no audio was captured
        (transcript-only turns are discarded). Saving is best-effort: a
        failure is logged and the record built so far is still returned.
        """
        if not self.enabled:
            return {}
        with self._lock:
            # Snapshot and reset under the lock so capture calls can begin
            # buffering the next turn while this one is written out.
            user_data = b"".join(self._user_buf)
            robot_data = b"".join(self._robot_buf)
            user_text = self._user_text
            robot_text = self._robot_text
            started_at = self._started_at
            self._user_buf.clear()
            self._robot_buf.clear()
            self._user_text = ""
            self._robot_text = ""

        if not user_data and not robot_data:
            return {}

        stamp = datetime.fromtimestamp(started_at).strftime("%Y%m%d_%H%M%S")
        entry = {"timestamp": stamp, "started_at": started_at,
                 "user_text": user_text, "robot_text": robot_text}
        try:
            stem = self._unique_stem(stamp)
            if user_data:
                p = self.out_dir / f"{stem}_user.wav"
                self._save_wav(p, user_data, self.user_rate)
                entry["user_wav"] = str(p)
                # 2 bytes per sample, mono.
                entry["user_duration_sec"] = round(
                    len(user_data) / (self.user_rate * 2), 3)
            if robot_data:
                p = self.out_dir / f"{stem}_robot.wav"
                self._save_wav(p, robot_data, self.robot_rate)
                entry["robot_wav"] = str(p)
                entry["robot_duration_sec"] = round(
                    len(robot_data) / (self.robot_rate * 2), 3)
            self._append_index(entry)
            log.info("recorded turn → %s (user %.1fs, robot %.1fs)",
                     stem,
                     entry.get("user_duration_sec", 0),
                     entry.get("robot_duration_sec", 0))
        except Exception as exc:
            # Recording must never take the voice loop down with it.
            log.warning("recording save failed: %s", exc)
        return entry

    def _unique_stem(self, stamp: str) -> str:
        """Return `stamp`, or `stamp_N`, such that no WAV of that stem exists."""
        stem, attempt = stamp, 1
        while any((self.out_dir / f"{stem}_{who}.wav").exists()
                  for who in ("user", "robot")):
            attempt += 1
            stem = f"{stamp}_{attempt}"
        return stem

    @staticmethod
    def _save_wav(path: Path, pcm: bytes, rate: int) -> None:
        """Write `pcm` to `path` as a mono, 16-bit WAV at `rate` Hz."""
        with wave.open(str(path), "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(rate)
            wf.writeframes(pcm)

    def _append_index(self, entry: dict) -> None:
        """Append `entry` to `index.json`, replacing the file atomically.

        An unreadable or malformed index is logged and restarted. The new
        content is written to a sibling temp file and moved into place so
        an interrupted write cannot leave a truncated index behind.
        """
        idx_path = self.out_dir / "index.json"
        payload: dict = {}
        try:
            loaded = json.loads(idx_path.read_text(encoding="utf-8"))
        except FileNotFoundError:
            pass  # first record in this directory
        except (OSError, ValueError) as exc:
            log.warning("index %s unreadable, starting a new one: %s",
                        idx_path, exc)
        else:
            if isinstance(loaded, dict):
                payload = loaded

        records = payload.get("records")
        if not isinstance(records, list):
            records = payload["records"] = []
        records.append(entry)
        payload["total_records"] = len(records)

        tmp_path = idx_path.with_name(idx_path.name + ".tmp")
        tmp_path.write_text(
            json.dumps(payload, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )
        os.replace(tmp_path, idx_path)


# ─── BRAIN FACTORY ───────────────────────────────────────

def _build_brain(name: str, audio_io, recorder, voice: str, system_prompt: str):
    """Instantiate the voice brain selected by `name`.

    `name` is matched case-insensitively after trimming whitespace; an empty
    or missing name selects Gemini. The chosen implementation module is
    imported only when selected, and every brain is constructed with
    ``(audio_io, recorder, voice, system_prompt)``.

    Raises:
        ValueError: if `name` is not a known brain.
    """
    key = (name or "").strip().lower()
    # To add a provider: add a branch that imports its brain class.
    if key == "local":
        from Project.Sanad.local.script import LocalBrain as brain_cls
    elif key == "model":
        from Project.Sanad.voice.model_script import ModelBrain as brain_cls
    elif key in ("", "gemini"):
        from Project.Sanad.gemini.script import GeminiBrain as brain_cls
    else:
        raise ValueError(f"unknown voice brain: {key!r}")
    return brain_cls(audio_io, recorder, voice, system_prompt)


# ─── MAIN ────────────────────────────────────────────────

def main() -> None:
    """Entry point: bring up DDS and audio, build the brain, run it to exit.

    Command line: ``<iface> [--voice NAME]``. With no arguments the module
    docstring is printed and the process exits with status 1. The audio
    profile and brain are chosen via SANAD_AUDIO_PROFILE / SANAD_VOICE_BRAIN.

    Once audio has been started it is always stopped again, including when
    setup fails before the brain runs. An exception raised by the brain's
    run loop is logged with its traceback and swallowed; Ctrl-C during the
    run loop ends it quietly.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)

    iface = sys.argv[1]
    voice = GEMINI_VOICE
    if "--voice" in sys.argv:
        value_pos = sys.argv.index("--voice") + 1
        if value_pos >= len(sys.argv):
            # Clear message instead of a bare IndexError traceback.
            sys.exit("error: --voice requires a value")
        voice = sys.argv[value_pos]

    log.info("DDS on %s", iface)
    ChannelFactoryInitialize(0, iface)
    ac = AudioClient()
    ac.SetTimeout(10.0)
    ac.Init()
    log.info("AudioClient ready")

    profile = os.environ.get("SANAD_AUDIO_PROFILE", "builtin")
    audio = AudioIO.from_profile(profile, audio_client=ac)
    audio.start()
    brain = None
    try:
        log.info("audio profile=%s", audio.profile_id)

        # Sanity-check the mic before handing it to the brain
        log.info("testing mic 2s...")
        time.sleep(2)
        test = audio.mic.read_chunk(1024)
        e = _audio_energy(test)
        log.info("mic energy=%d %s", e, "OK" if e > 0 else "SILENT")

        recorder = TurnRecorder(enabled=RECORD_ENABLED)
        if RECORD_ENABLED:
            log.info("recording enabled → %s", RECORD_DIR)

        system_prompt = _load_system_prompt()
        brain_name = os.environ.get("SANAD_VOICE_BRAIN", "gemini")
        brain = _build_brain(brain_name, audio, recorder, voice, system_prompt)
        log.info("voice brain=%s  voice=%s  log=%s", brain_name, voice, LOG_FILE)
        log.info("─" * 50)

        try:
            asyncio.run(brain.run())
        except KeyboardInterrupt:
            pass
        except Exception as exc:
            # Keep the traceback: the message alone rarely locates the fault.
            log.error("fatal: %s", exc, exc_info=True)
    finally:
        # Runs for setup failures too, so a started audio pipeline is never
        # left behind; setup errors still propagate after cleanup.
        log.info("stopping")
        if brain is not None:
            try:
                brain.stop()
            except Exception:
                log.warning("brain.stop() failed", exc_info=True)
        audio.stop()
        log.info("stopped")


# Run the orchestrator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()