Sanad/voice/sanad_voice.py

582 lines
24 KiB
Python

#!/usr/bin/env python3
"""Sanad voice subprocess — Gemini Live (google-genai SDK) on the G1.
Mic/speaker are selected at startup via `SANAD_AUDIO_PROFILE` (builtin |
anker | hollyland_builtin), materialised by `voice/audio_io.py`. The
default ("builtin") is UDP multicast mic + AudioClient.PlayStream.
Features: mic gain, echo suppression, barge-in, wait-for-user,
streaming playback, per-turn WAV recording.
Usage:
python3 voice/sanad_voice.py eth0
python3 voice/sanad_voice.py eth0 --voice Charon
SANAD_AUDIO_PROFILE=anker python3 voice/sanad_voice.py eth0
"""
import array
import asyncio
import json
import logging
import os
import sys
import threading
import time
import wave
from datetime import datetime
from pathlib import Path
import numpy as np
from google import genai
from google.genai import types
from unitree_sdk2py.core.channel import ChannelFactoryInitialize
from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
from Project.Sanad.voice.audio_io import AudioIO, Mic, Speaker
# ─── LOGGING ─────────────────────────────────────────────
# Log destination/name come from the project config when it imports cleanly;
# otherwise fall back to defaults so this module still starts standalone.
try:
    from Project.Sanad.core.config_loader import section as _cfg_section_log
    _LOG_CFG = _cfg_section_log("voice", "sanad_voice")
except Exception:
    _LOG_CFG = {}
LOG_DIR = os.path.expanduser(_LOG_CFG.get("log_dir", "~/logs"))
os.makedirs(LOG_DIR, exist_ok=True)
_LOG_NAME = _LOG_CFG.get("log_name", "gemini_live_v2")
# One file per calendar day, e.g. gemini_live_v2_20250101.log
LOG_FILE = os.path.join(LOG_DIR, f"{_LOG_NAME}_{datetime.now():%Y%m%d}.log")
# Mirror every record to both the daily file and stderr.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s.%(msecs)03d [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("gemini_v2")
# ─── CONFIG — single source of truth ─────────────────────
#
# Gemini credentials + audio rates live in config/core_config.json
# (exposed via config.py as GEMINI_API_KEY, GEMINI_MODEL, etc).
# Voice-loop-specific tunables live in config/voice_config.json.
try:
    from Project.Sanad.config import (
        GEMINI_API_KEY, GEMINI_VOICE,
        SEND_SAMPLE_RATE, RECEIVE_SAMPLE_RATE, CHUNK_SIZE,
    )
    from Project.Sanad.core.config_loader import section as _cfg_section
    _SV = _cfg_section("voice", "sanad_voice")
    _MIC = _cfg_section("voice", "mic_udp")
    _SP = _cfg_section("voice", "speaker")
    _REC = _cfg_section("voice", "recording")
except Exception:
    # Fallbacks let the module import without the project config package.
    GEMINI_API_KEY, GEMINI_VOICE = "", "Charon"
    SEND_SAMPLE_RATE, RECEIVE_SAMPLE_RATE, CHUNK_SIZE = 16000, 24000, 512
    _SV, _MIC, _SP, _REC = {}, {}, {}, {}
API_KEY = GEMINI_API_KEY
# Gemini Live model name (without "models/" prefix expected by google-genai SDK)
MODEL = os.environ.get("SANAD_GEMINI_MODEL",
                       "gemini-2.5-flash-native-audio-preview-12-2025")
VOICE_NAME = GEMINI_VOICE
SEND_RATE = SEND_SAMPLE_RATE        # mic → Gemini PCM rate (Hz)
RECEIVE_RATE = RECEIVE_SAMPLE_RATE  # Gemini → speaker PCM rate (Hz)
CHUNK_SAMPLES = CHUNK_SIZE          # samples per mic read
MIC_GAIN = _SV.get("mic_gain", 1.0)
PLAY_CHUNK = _SV.get("play_chunk_bytes", 96000)
# One mic chunk of 16-bit silence, used to mask echo while the robot speaks.
SILENCE_PCM = b'\x00' * (CHUNK_SAMPLES * 2)
# ─── RECORDING ───────────────────────────────────────────
# SANAD_RECORD env var overrides the config flag; any value other than "0"
# enables per-turn WAV recording.
RECORD_ENABLED = os.environ.get("SANAD_RECORD",
                                "1" if _REC.get("enabled", True) else "0") != "0"
_rec_dir_rel = _REC.get("dir_relative", "data/recordings")
RECORD_DIR = Path(
    os.environ.get(
        "SANAD_RECORD_DIR",
        str(Path(__file__).resolve().parent.parent / _rec_dir_rel),
    )
)
SYSTEM_PROMPT = (
    "You are Marcus, a bilingual humanoid robot assistant made by YS Lootah Technology, Dubai, UAE. "
    "RESPOND IN ARABIC (Gulf/Emirati dialect) OR ENGLISH ONLY. "
    "YOU MUST RESPOND UNMISTAKABLY IN THE SAME LANGUAGE THE USER SPEAKS. "
    "If the user speaks Arabic, you MUST reply in Arabic Gulf dialect. "
    "If the user speaks English, you MUST reply in English. "
    "Do NOT confuse Arabic with Japanese, Hindi, Russian, or any other language. "
    "The user is speaking Arabic or English — nothing else. "
    "Be concise — 1 to 2 sentences max. Be friendly and natural. "
    "If the user interrupts and says 'continue' or 'كمل', resume EXACTLY where you stopped. "
    "Only respond to clear human speech. Ignore background noise and silence completely. "
    "Do not respond to sounds that are not words."
)
# ─── HELPERS ─────────────────────────────────────────────
def audio_energy(pcm: bytes) -> int:
    """Return the mean absolute amplitude of native-endian 16-bit mono PCM.

    Empty or malformed input (e.g. an odd byte count) yields 0 rather
    than raising, so callers can feed raw buffers without pre-checking.
    """
    try:
        samples = array.array("h", pcm)
        if not samples:
            return 0
        return sum(map(abs, samples)) // len(samples)
    except Exception:
        return 0
# ─── TURN RECORDER ──────────────────────────────────────
class TurnRecorder:
    """Persist each conversational turn as a pair of WAV files.

    A turn opens on the first audio chunk handed to `capture_user` /
    `capture_robot` and closes on `finish_turn`. Output lands in
    SANAD_RECORD_DIR as `<timestamp>_user.wav` (16 kHz) and
    `<timestamp>_robot.wav` (24 kHz), and every turn is appended to
    `index.json` (timestamp, transcripts, durations) for later browsing
    by the dashboard. All mutation goes through an internal lock so the
    send and receive tasks can feed it concurrently.
    """

    def __init__(self, enabled: bool = True, out_dir: Path = RECORD_DIR):
        self.enabled = enabled
        self.out_dir = out_dir
        if self.enabled:
            self.out_dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
        self._user_buf: list[bytes] = []
        self._robot_buf: list[bytes] = []
        self._user_text = ""
        self._robot_text = ""
        self._started_at: float = 0.0

    def _capture(self, buf: list[bytes], pcm: bytes) -> None:
        # First chunk of either stream stamps the turn's start time.
        with self._lock:
            if not self._user_buf and not self._robot_buf:
                self._started_at = time.time()
            buf.append(pcm)

    def capture_user(self, pcm: bytes) -> None:
        """Buffer one chunk of user-mic PCM (16 kHz)."""
        if self.enabled and pcm:
            self._capture(self._user_buf, pcm)

    def capture_robot(self, pcm: bytes) -> None:
        """Buffer one chunk of Gemini output PCM (24 kHz)."""
        if self.enabled and pcm:
            self._capture(self._robot_buf, pcm)

    def add_user_text(self, text: str) -> None:
        """Accumulate the user's transcript for the current turn."""
        if text and self.enabled:
            with self._lock:
                self._user_text = f"{self._user_text} {text}".strip()

    def add_robot_text(self, text: str) -> None:
        """Accumulate the robot's transcript for the current turn."""
        if text and self.enabled:
            with self._lock:
                self._robot_text = f"{self._robot_text} {text}".strip()

    def finish_turn(self) -> dict:
        """Save current buffers to disk, reset state, return metadata."""
        if not self.enabled:
            return {}
        with self._lock:
            user_pcm = b"".join(self._user_buf)
            robot_pcm = b"".join(self._robot_buf)
            user_txt, robot_txt = self._user_text, self._robot_text
            turn_start = self._started_at
            self._user_buf.clear()
            self._robot_buf.clear()
            self._user_text = ""
            self._robot_text = ""
        if not user_pcm and not robot_pcm:
            return {}
        stamp = datetime.fromtimestamp(turn_start).strftime("%Y%m%d_%H%M%S")
        entry = {"timestamp": stamp, "started_at": turn_start,
                 "user_text": user_txt, "robot_text": robot_txt}
        try:
            # Each present stream gets its own WAV at its native rate.
            for pcm, tag, rate in ((user_pcm, "user", SEND_RATE),
                                   (robot_pcm, "robot", RECEIVE_RATE)):
                if not pcm:
                    continue
                wav_path = self.out_dir / f"{stamp}_{tag}.wav"
                self._save_wav(wav_path, pcm, rate)
                entry[f"{tag}_wav"] = str(wav_path)
                # 2 bytes/sample mono → seconds.
                entry[f"{tag}_duration_sec"] = round(len(pcm) / (rate * 2), 3)
            self._append_index(entry)
            log.info("recorded turn → %s (user %.1fs, robot %.1fs)",
                     stamp,
                     entry.get("user_duration_sec", 0),
                     entry.get("robot_duration_sec", 0))
        except Exception as exc:
            # Best-effort: never let a recording failure kill the voice loop.
            log.warning("recording save failed: %s", exc)
        return entry

    def _save_wav(self, path: Path, pcm: bytes, rate: int) -> None:
        # Mono, 16-bit container at the given sample rate.
        with wave.open(str(path), "wb") as wav:
            wav.setnchannels(1)
            wav.setsampwidth(2)
            wav.setframerate(rate)
            wav.writeframes(pcm)

    def _append_index(self, entry: dict) -> None:
        # Read-modify-write of index.json; a corrupt/missing file resets it.
        idx_path = self.out_dir / "index.json"
        payload = {"records": []}
        try:
            if idx_path.exists():
                loaded = json.loads(idx_path.read_text(encoding="utf-8"))
                if isinstance(loaded, dict):
                    payload = loaded
        except Exception:
            payload = {"records": []}
        records = payload.setdefault("records", [])
        records.append(entry)
        payload["total_records"] = len(records)
        idx_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
                            encoding="utf-8")
# Mic + speaker classes now live in voice/audio_io.py — built via
# AudioIO.from_profile() in main(). Selected with SANAD_AUDIO_PROFILE
# (builtin | anker | hollyland_builtin).
# ─── SESSION ─────────────────────────────────────────────
async def run_session(mic: Mic, speaker: Speaker, voice: str) -> None:
    """Run the Gemini Live voice loop forever, reconnecting on any failure.

    Each outer-loop iteration opens one Live session and runs two tasks:
    `send_mic` (mic → Gemini, with gain, barge-in detection and echo
    suppression) and `receive` (Gemini → speaker, with transcripts and
    per-turn recording). A session ends on server go_away, send failure,
    a 30 s receive stall, or the configured wall-clock timeout; the loop
    then reconnects with exponential backoff on repeated errors.
    """
    client = genai.Client(api_key=API_KEY)
    recorder = TurnRecorder(enabled=RECORD_ENABLED)
    if RECORD_ENABLED:
        log.info("recording enabled → %s", RECORD_DIR)
    # One LiveConnectConfig reused for every reconnect: audio-only replies,
    # the requested prebuilt voice, server-side VAD tuned from config, and
    # transcription of both directions so turns can be logged/recorded.
    config = types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice
                )
            )
        ),
        realtime_input_config=types.RealtimeInputConfig(
            automatic_activity_detection=types.AutomaticActivityDetection(
                disabled=False,
                start_of_speech_sensitivity=getattr(
                    types.StartSensitivity,
                    _cfg_section("voice", "vad").get(
                        "start_sensitivity", "START_SENSITIVITY_HIGH")),
                end_of_speech_sensitivity=getattr(
                    types.EndSensitivity,
                    _cfg_section("voice", "vad").get(
                        "end_sensitivity", "END_SENSITIVITY_LOW")),
                prefix_padding_ms=_cfg_section("voice", "vad").get("prefix_padding_ms", 20),
                silence_duration_ms=_cfg_section("voice", "vad").get("silence_duration_ms", 200),
            )
        ),
        input_audio_transcription=types.AudioTranscriptionConfig(),
        output_audio_transcription=types.AudioTranscriptionConfig(),
        system_instruction=types.Content(
            parts=[types.Part(text=SYSTEM_PROMPT)]
        ),
    )
    session_num = 0
    start_time = time.time()
    consecutive_errors = 0
    while True:
        session_num += 1
        # Per-session shared state, mutated by both tasks via nonlocal.
        speaking = False            # robot audio currently playing
        stream_started = False      # speaker stream opened for this turn
        barge_block_until = 0.0     # ignore barge-in until this timestamp
        ai_speak_start = 0.0
        last_ai_audio = 0.0
        # Barge-in tunables are re-read each session so config edits apply
        # on the next reconnect without a restart.
        _bi = _cfg_section("voice", "barge_in")
        BARGE_THRESHOLD = _bi.get("threshold", 500)
        LOUD_CHUNKS_NEEDED = _bi.get("loud_chunks_needed", 3)
        BARGE_COOLDOWN = _bi.get("cooldown_sec", 0.3)
        ECHO_SUPPRESS_BELOW = _bi.get("echo_suppress_below", 500)
        AI_SPEAK_GRACE_SEC = _bi.get("ai_speak_grace_sec", 0.15)
        uptime_min = (time.time() - start_time) / 60
        try:
            log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
                     session_num, uptime_min)
            async with client.aio.live.connect(model=MODEL, config=config) as session:
                log.info("connected — speak anytime!")
                consecutive_errors = 0  # reset on successful connect
                mic.flush()
                done = asyncio.Event()

                # ── Send mic ──
                async def send_mic():
                    """Stream amplified mic PCM to Gemini until `done` is set."""
                    nonlocal speaking, barge_block_until
                    chunk_bytes = CHUNK_SAMPLES * 2  # 16-bit mono → 2 bytes/sample
                    loud_count = 0
                    last_activity = time.time()
                    loop = asyncio.get_event_loop()
                    while not done.is_set():
                        try:
                            # mic.read_chunk blocks, so run it off-loop.
                            raw = await loop.run_in_executor(
                                None, lambda: mic.read_chunk(chunk_bytes))
                        except Exception:
                            break
                        # Amplify
                        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
                        samples = np.clip(samples * MIC_GAIN, -32768, 32767).astype(np.int16)
                        data = samples.tobytes()
                        energy = audio_energy(data)
                        now = time.time()
                        # Barge-in: while the robot speaks (past a short grace
                        # window), count consecutive loud chunks; enough of
                        # them interrupts playback, then a cooldown applies.
                        if speaking and now >= barge_block_until:
                            if (now - ai_speak_start) >= AI_SPEAK_GRACE_SEC:
                                if energy > BARGE_THRESHOLD:
                                    loud_count += 1
                                else:
                                    loud_count = max(0, loud_count - 1)
                                if loud_count > LOUD_CHUNKS_NEEDED:
                                    log.info("BARGE-IN (e=%d)", energy)
                                    do_interrupt("barge-in")
                                    loud_count = 0
                                    barge_block_until = now + BARGE_COOLDOWN
                        # Echo suppression: quiet mic input during robot
                        # speech is replaced with silence so Gemini's VAD
                        # doesn't hear the speaker's own output.
                        send_data = data
                        if speaking and energy < ECHO_SUPPRESS_BELOW:
                            send_data = SILENCE_PCM[:chunk_bytes]
                        # Record user audio (only when clearly speaking,
                        # energy > 250 — skip ambient silence noise)
                        if energy > 250 and not speaking:
                            recorder.capture_user(data)
                        # Watchdog: heartbeat log every ~10 s of silence.
                        if energy > 250:
                            last_activity = now
                        elif now - last_activity > 10:
                            # NOTE(review): reaches into mic._buf for the log —
                            # assumes the Mic implementation exposes it.
                            log.info("alive (no speech %.0fs, e=%d, buf=%d)",
                                     now - last_activity, energy, len(mic._buf))
                            last_activity = now
                        try:
                            await session.send_realtime_input(
                                audio=types.Blob(
                                    data=send_data,
                                    mime_type=f"audio/pcm;rate={SEND_RATE}"
                                )
                            )
                        except asyncio.CancelledError:
                            return
                        except Exception as e:
                            log.warning("mic send failed: %s — ending session", e)
                            done.set()
                            return
                        # Pace sends to real time (one chunk's duration).
                        await asyncio.sleep(CHUNK_SAMPLES / SEND_RATE)
                    log.info("send_mic task ended")

                # ── Interrupt helper ──
                def do_interrupt(source="local"):
                    """Stop playback, reset turn state, and flush the turn."""
                    nonlocal speaking, stream_started
                    speaking = False
                    stream_started = False
                    speaker.stop()
                    mic.flush()
                    recorder.finish_turn()

                # ── Receive ──
                async def receive():
                    """Consume server events: audio, transcripts, turn ends."""
                    nonlocal speaking, stream_started
                    nonlocal ai_speak_start, last_ai_audio
                    loop = asyncio.get_event_loop()
                    try:
                        last_recv = time.time()
                        while not done.is_set():
                            async for response in session.receive():
                                last_recv = time.time()
                                if done.is_set():
                                    break
                                # Server going away — reconnect soon
                                if hasattr(response, 'go_away') and response.go_away is not None:
                                    log.info("server going away — will reconnect")
                                    done.set()
                                    return
                                sc = response.server_content
                                if sc is None:
                                    continue
                                # Gemini interrupted
                                if sc.interrupted is True:
                                    if speaking:
                                        log.info("Gemini interrupted")
                                        do_interrupt("gemini")
                                    continue
                                # User transcript
                                if sc.input_transcription:
                                    text = (sc.input_transcription.text or "").strip()
                                    if text and not speaking:
                                        log.info("USER: %s", text)
                                        recorder.add_user_text(text)
                                # Marcus transcript
                                if sc.output_transcription:
                                    text = (sc.output_transcription.text or "").strip()
                                    if text:
                                        log.info("MARCUS: %s", text)
                                        recorder.add_robot_text(text)
                                # AI audio
                                if sc.model_turn:
                                    for part in sc.model_turn.parts:
                                        if part.inline_data and part.inline_data.data:
                                            now = time.time()
                                            if not speaking:
                                                ai_speak_start = now
                                                speaking = True
                                            last_ai_audio = now
                                            raw_audio = part.inline_data.data
                                            recorder.capture_robot(raw_audio)
                                            audio = np.frombuffer(
                                                raw_audio, dtype=np.int16)
                                            # Speaker calls block → executor.
                                            if not stream_started:
                                                await loop.run_in_executor(
                                                    None, speaker.begin_stream)
                                                stream_started = True
                                            await loop.run_in_executor(
                                                None, speaker.send_chunk,
                                                audio, RECEIVE_RATE)
                                # Turn complete
                                if sc.turn_complete:
                                    if speaking and stream_started and not speaker.interrupted:
                                        dur = speaker.total_sent_sec
                                        log.info("speaker %.1fs", dur)
                                        # Drain playback before listening again.
                                        await loop.run_in_executor(
                                            None, speaker.wait_finish)
                                    elif speaking and speaker.interrupted:
                                        log.info("speaker interrupted")
                                    speaking = False
                                    stream_started = False
                                    mic.flush()
                                    recorder.finish_turn()
                                    log.info("listening")
                            # receive() iterator ended — check if session is still alive
                            if time.time() - last_recv > 30:
                                log.warning("no messages from Gemini for 30s — session dead")
                                break
                            await asyncio.sleep(0.1)
                    except Exception as e:
                        log.warning("receive ended: %s", e)
                    finally:
                        # Always release send_mic so the session can wind down.
                        done.set()

                try:
                    await asyncio.wait_for(
                        asyncio.gather(send_mic(), receive()),
                        timeout=_SV.get("session_timeout_sec", 660),  # 11 min max (server go_away at ~10 min)
                    )
                except asyncio.TimeoutError:
                    log.warning("session timed out after 11 min")
                except asyncio.CancelledError:
                    log.warning("session cancelled")
                log.info("session #%d ended — reconnecting in 1s", session_num)
                speaker.stop()
                mic.flush()
                await asyncio.sleep(1)
        except asyncio.CancelledError:
            log.info("cancelled — stopping")
            break
        except KeyboardInterrupt:
            log.info("keyboard interrupt — stopping")
            break
        except Exception as e:
            consecutive_errors += 1
            # Exponential backoff: 2s, 4s, 8s, 16s, max 30s
            delay = min(30, 2 ** consecutive_errors)
            log.error("session error (#%d): %s — reconnecting in %ds",
                      consecutive_errors, e, delay)
            await asyncio.sleep(delay)
            # After 10 consecutive errors, restart the client
            if consecutive_errors >= 10:
                log.warning("10 consecutive errors — recreating client")
                try:
                    client = genai.Client(api_key=API_KEY)
                    consecutive_errors = 0
                except Exception as ce:
                    log.error("client recreation failed: %s", ce)
# ─── MAIN ────────────────────────────────────────────────
def main():
    """CLI entry: `python3 voice/sanad_voice.py <iface> [--voice NAME]`.

    Initializes DDS on the given interface, builds the audio I/O pair from
    SANAD_AUDIO_PROFILE, runs a 2 s mic sanity check, then hands control to
    run_session() until Ctrl-C. Always stops the audio stack on exit.
    """
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    iface = sys.argv[1]
    voice = VOICE_NAME
    if "--voice" in sys.argv:
        idx = sys.argv.index("--voice")
        # Fix: a trailing "--voice" with no value used to raise IndexError.
        if idx + 1 < len(sys.argv):
            voice = sys.argv[idx + 1]
        else:
            log.warning("--voice given without a value — keeping %s", voice)
    log.info("DDS on %s", iface)
    ChannelFactoryInitialize(0, iface)
    ac = AudioClient()
    ac.SetTimeout(10.0)
    ac.Init()
    log.info("AudioClient ready")
    # Mic/speaker pair is selected by the SANAD_AUDIO_PROFILE env var.
    profile = os.environ.get("SANAD_AUDIO_PROFILE", "builtin")
    audio = AudioIO.from_profile(profile, audio_client=ac)
    audio.start()
    mic, speaker = audio.mic, audio.speaker
    log.info("audio profile=%s", audio.profile_id)
    # Quick sanity check: let buffers fill for 2 s, then sample one chunk.
    log.info("testing mic 2s...")
    time.sleep(2)
    test = mic.read_chunk(1024)
    e = audio_energy(test)
    log.info("mic energy=%d %s", e, "OK" if e > 0 else "SILENT")
    log.info("voice=%s log=%s", voice, LOG_FILE)
    # Fix: was `"" * 50`, which logged an empty line — restore the separator.
    log.info("─" * 50)
    try:
        asyncio.run(run_session(mic, speaker, voice))
    except KeyboardInterrupt:
        pass
    except Exception as e:
        log.error("fatal: %s", e)
    finally:
        log.info("stopped")
        audio.stop()


if __name__ == "__main__":
    main()