Marcus/Voice/marcus_gemini_voice.py
2026-04-12 18:50:22 +04:00

609 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Voice/marcus_gemini_voice.py — Marcus Gemini Live Voice Module v2
==================================================================
Real-time bidirectional voice conversation using Gemini 2.5 Flash native audio.
Uses G1 built-in speaker + Hollyland wireless mic.
Based on SanadVoice/gemini_interact architecture:
- PyAudio for mic (not parec)
- Echo suppression (silence when speaking)
- Gemini VAD (automatic activity detection)
- thinkingBudget=0 (no thinking text)
- ASR buffering for full sentences
- Vision routed to brain's Qwen camera
Usage:
from Voice.marcus_gemini_voice import GeminiVoiceModule
voice = GeminiVoiceModule(audio_api, on_transcript=callback)
voice.start()
"""
import array
import asyncio
import base64
import json
import logging
import os
import subprocess
import threading
import time
import numpy as np
from dotenv import load_dotenv
load_dotenv()
# Project layout: everything lives under $PROJECT_BASE/Marcus (default /home/unitree).
BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)
# Log to both logs/voice.log and the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("gemini_voice")
def load_config(name: str) -> dict:
    """Load and parse Config/config_<name>.json from the project root.

    Args:
        name: Config suffix, e.g. "Voice" -> Config/config_Voice.json.

    Returns:
        The parsed JSON object as a dict.

    Raises:
        FileNotFoundError: if the config file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
    # Explicit UTF-8: the platform-default encoding is not guaranteed to
    # decode JSON config files correctly on every deployment target.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
# ─── CONFIGURATION ────────────────────────────────────────
# SECURITY: the Gemini API key must come from the environment (.env is
# loaded above via load_dotenv()) — never hard-code credentials in source.
API_KEY = os.environ.get("GEMINI_API_KEY", "")
MODEL = "models/gemini-2.5-flash-native-audio-preview-12-2025"
# Gemini Live API bidirectional websocket endpoint (v1alpha BidiGenerateContent).
URI = (
    "wss://generativelanguage.googleapis.com/ws/"
    "google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent"
    f"?key={API_KEY}"
)
VOICE_NAME = "Charon"    # Gemini prebuilt voice
SEND_RATE = 16000        # PCM sample rate sent to Gemini (Hz)
RECEIVE_RATE = 24000     # PCM sample rate received from Gemini (Hz)
CHUNK_SIZE = 512         # mic frames per buffer
CHANNELS = 1             # mono audio throughout
def load_system_prompt():
    """Return the Marcus voice system prompt.

    Uses Config/marcus_voice_prompt.txt when present (utf-8-sig tolerates a
    BOM from Windows editors); otherwise falls back to a built-in bilingual
    default.
    """
    candidates = [
        os.path.join(PROJECT_ROOT, "Config", "marcus_voice_prompt.txt"),
    ]
    for candidate in candidates:
        if not os.path.exists(candidate):
            continue
        with open(candidate, "r", encoding="utf-8-sig") as fh:
            return fh.read().strip()
    return (
        "You are Marcus, a humanoid robot by YS Lootah Technology, Dubai. "
        "Speak Arabic (UAE dialect) and English. Be concise — 1-2 sentences max."
    )
# ─── AUDIO HELPERS ────────────────────────────────────────
def audio_energy(pcm: bytes) -> int:
    """Average absolute sample value of 16-bit little-endian mono PCM.

    Returns 0 for empty or malformed (e.g. odd-length) input instead of
    raising, so callers can treat any bad chunk as silence.
    """
    try:
        samples = array.array("h", pcm)
    except Exception:
        return 0
    count = len(samples)
    if count == 0:
        return 0
    return sum(map(abs, samples)) // count
# One mic chunk of pure silence (CHUNK_SIZE 16-bit samples) — substituted for
# real mic audio while Marcus is speaking, to keep Gemini's VAD from hearing
# the robot's own speaker echo.
SILENCE_PCM = b'\x00' * (CHUNK_SIZE * 2)
# ─── GEMINI VOICE MODULE ─────────────────────────────────
class GeminiVoiceModule:
    """Real-time voice conversation via Gemini, using G1 speaker + Hollyland mic."""

    def __init__(self, audio_api, on_transcript=None):
        # audio_api: Unitree G1 AudioAPI wrapper (its ._client is used for playback).
        # on_transcript: optional callback(text, role) — role is "user" or "marcus".
        self._audio = audio_api
        self._on_transcript = on_transcript
        self._config = load_config("Voice")
        # PulseAudio source for the mic: prefer the AudioAPI's own value if it
        # exposes one, else fall back to the Voice config's source_index.
        self._mic_source = getattr(audio_api, '_mic_source',
            self._config["mic"].get("source_index", "0"))
        # State
        self.speaking = False      # True while Gemini audio is queued/playing
        self.interrupted = False   # set on barge-in; drops audio until Gemini acks
        self._running = False
        self._thread = None
        self._audio_queue = None  # Created in async context
        # Tuning
        self.MIN_THRESHOLD = 3000              # hard floor for the barge-in energy threshold
        self.barge_in_threshold = self.MIN_THRESHOLD  # recalibrated from ambient noise
        self.REQUIRED_LOUD_CHUNKS = 10         # consecutive loud chunks needed to barge in
        self.PREBUFFER_CHUNKS = 2              # chunks buffered before playback starts
        self.PLAYBACK_TIMEOUT = 0.25           # queue-get timeout while collecting audio (s)
        self.BARGE_IN_COOLDOWN = 0.7           # ignore further barge-ins for this long (s)
        self.AI_SPEAK_GRACE = 0.20             # no barge-in right after AI starts speaking (s)
        self.ECHO_GUARD_SEC = 0.8              # ignore input transcription after AI audio (s)
        self.SPEAKING_ENERGY_GATE = 0.85       # below this fraction of threshold, mic -> silence
        self.SEND_SILENCE_WHEN_SPEAKING = True # echo suppression enabled by default
        # Timing
        self._ai_speaking_since = 0.0
        self._last_ai_audio_time = 0.0
        self._barge_in_block_until = 0.0
        self._ignore_input_until = 0.0
        # ASR buffer
        self._asr_buf = ""
        self._asr_last_time = 0.0
        self.ASR_WINDOW_SEC = 2.0  # gap after which a new utterance starts fresh
        # Find Hollyland mic PyAudio device index
        self._mic_device_idx = self._find_mic_device()
        log.info("GeminiVoiceModule v2 initialized")

    # ─── MIC DEVICE DETECTION ─────────────────────────────
    def _find_mic_device(self) -> int:
        """Find Hollyland wireless mic in PyAudio devices. Returns device index."""
        import pyaudio
        import ctypes
        # Install a no-op ALSA error handler so device enumeration doesn't spam stderr.
        ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)
        def _alsa_error_handler(filename, line, function, err, fmt):
            pass  # suppress
        c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler)
        try:
            asound = ctypes.cdll.LoadLibrary("libasound.so.2")
            asound.snd_lib_error_set_handler(c_error_handler)
        except: pass  # ALSA_suppress
        pa = pyaudio.PyAudio()
        try:
            # First: set PulseAudio default source to Hollyland
            subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
            subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)
            # Search for wireless mic by name
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                name = info.get("name", "").lower()
                if info["maxInputChannels"] > 0 and ("wireless" in name or "hollyland" in name):
                    log.info("Mic found: [%d] %s (%dHz)", i, info["name"], int(info["defaultSampleRate"]))
                    return i
            # Fallback to 'default' or 'pulse' device
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                if info["maxInputChannels"] > 0 and info.get("name", "") in ("default", "pulse"):
                    log.info("Mic fallback: [%d] %s", i, info["name"])
                    return i
            log.warning("No mic found, using device 0")
            return 0
        finally:
            pa.terminate()

    # ─── MIC CALIBRATION ──────────────────────────────────
    def _calibrate_mic(self):
        """Calibrate barge-in threshold from ambient noise."""
        import pyaudio
        import ctypes
        # Same ALSA stderr suppression as _find_mic_device.
        ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)
        def _alsa_error_handler(filename, line, function, err, fmt):
            pass  # suppress
        c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler)
        try:
            asound = ctypes.cdll.LoadLibrary("libasound.so.2")
            asound.snd_lib_error_set_handler(c_error_handler)
        except: pass  # ALSA_suppress
        pa = pyaudio.PyAudio()
        mic_info = pa.get_device_info_by_index(self._mic_device_idx)
        mic_rate = int(mic_info["defaultSampleRate"])
        mic_channels = 1
        try:
            stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
                rate=mic_rate, input=True,
                input_device_index=self._mic_device_idx,
                frames_per_buffer=CHUNK_SIZE)
            # Sample 40 chunks of ambient noise.
            values = []
            for _ in range(40):
                data = stream.read(CHUNK_SIZE, exception_on_overflow=False)
                values.append(audio_energy(data))
            stream.stop_stream()
            stream.close()
            avg_noise = sum(values) / len(values) if values else 0
            # Threshold = 3x ambient noise, but never below the hard floor.
            self.barge_in_threshold = max(self.MIN_THRESHOLD, avg_noise * 3.0)
            log.info("Mic calibrated: noise=%.0f, threshold=%.0f", avg_noise, self.barge_in_threshold)
        except Exception as e:
            # Best-effort: keep the default threshold if the mic can't be read.
            log.warning("Calibration failed: %s", e)
        finally:
            pa.terminate()

    # ─── G1 SPEAKER PLAYBACK ─────────────────────────────
    def _play_buffer_on_g1(self, pcm_24k: np.ndarray):
        """Play 24kHz audio on G1 speaker (resample to 16kHz, single call)."""
        if len(pcm_24k) < 100:
            return
        # Resample 24kHz → 16kHz
        tl = int(len(pcm_24k) * 16000 / 24000)
        audio_16k = np.interp(
            np.linspace(0, len(pcm_24k), tl, endpoint=False),
            np.arange(len(pcm_24k)),
            pcm_24k.astype(np.float64),
        ).astype(np.int16)
        from unitree_sdk2py.g1.audio.g1_audio_api import (
            ROBOT_API_ID_AUDIO_START_PLAY,
            ROBOT_API_ID_AUDIO_STOP_PLAY,
        )
        client = self._audio._client
        if not client:
            return
        app_name = "gemini"
        # Stop any in-progress playback before starting a new stream.
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))
        time.sleep(0.1)
        pcm = audio_16k.tobytes()
        sid = f"s_{int(time.time() * 1000)}"
        param = json.dumps({
            "app_name": app_name,
            "stream_id": sid,
            "sample_rate": 16000,
            "channels": 1,
            "bits_per_sample": 16,
        })
        client._CallRequestWithParamAndBin(ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm))
        # Block for the clip duration (+ margin) so playback isn't cut off early.
        duration = len(audio_16k) / 16000
        time.sleep(duration + 0.3)
        client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, json.dumps({"app_name": app_name}))

    # ─── WEBSOCKET TASKS ─────────────────────────────────
    async def _capture_mic(self, ws):
        """Continuously capture mic via PyAudio and send to Gemini."""
        import pyaudio
        import ctypes
        # Same ALSA stderr suppression as _find_mic_device.
        ERROR_HANDLER_FUNC = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p)
        def _alsa_error_handler(filename, line, function, err, fmt):
            pass  # suppress
        c_error_handler = ERROR_HANDLER_FUNC(_alsa_error_handler)
        try:
            asound = ctypes.cdll.LoadLibrary("libasound.so.2")
            asound.snd_lib_error_set_handler(c_error_handler)
        except: pass  # ALSA_suppress
        pa = pyaudio.PyAudio()
        mic_info = pa.get_device_info_by_index(self._mic_device_idx)
        mic_rate = int(mic_info["defaultSampleRate"])
        mic_channels = 1
        # Open mic at native rate/channels
        stream = pa.open(format=pyaudio.paInt16, channels=mic_channels,
            rate=mic_rate, input=True,
            input_device_index=self._mic_device_idx,
            frames_per_buffer=CHUNK_SIZE)
        log.info("Mic stream: device=%d, rate=%d, ch=%d", self._mic_device_idx, mic_rate, mic_channels)
        loud_chunks = 0
        loop = asyncio.get_event_loop()
        needs_resample = mic_rate != SEND_RATE or mic_channels != 1
        try:
            while self._running:
                # Blocking read runs in the executor so it doesn't stall the event loop.
                data = await loop.run_in_executor(
                    None, lambda: stream.read(CHUNK_SIZE, exception_on_overflow=False))
                # Convert to mono 16kHz if needed
                if needs_resample:
                    audio = np.frombuffer(data, dtype=np.int16)
                    # Stereo to mono
                    if mic_channels == 2:
                        audio = audio.reshape(-1, 2).mean(axis=1).astype(np.int16)
                    # Resample to 16kHz
                    if mic_rate != SEND_RATE:
                        tl = int(len(audio) * SEND_RATE / mic_rate)
                        if tl > 0:
                            audio = np.interp(
                                np.linspace(0, len(audio), tl, endpoint=False),
                                np.arange(len(audio)),
                                audio.astype(np.float64),
                            ).astype(np.int16)
                    data = audio.tobytes()
                energy = audio_energy(data)
                now = time.time()
                # Barge-in detection: only while AI is speaking, past the cooldown
                # and the initial grace period after the AI started talking.
                if self.speaking and now >= self._barge_in_block_until:
                    if (now - self._ai_speaking_since) >= self.AI_SPEAK_GRACE:
                        if energy > self.barge_in_threshold:
                            loud_chunks += 1
                        else:
                            loud_chunks = 0
                        if loud_chunks > self.REQUIRED_LOUD_CHUNKS:
                            log.info("Barge-in detected!")
                            self.interrupted = True
                            self.speaking = False
                            # Drop any queued (now stale) Gemini audio.
                            while not self._audio_queue.empty():
                                try: self._audio_queue.get_nowait()
                                except: break
                            loud_chunks = 0
                            self._barge_in_block_until = now + self.BARGE_IN_COOLDOWN
                # Echo suppression: send silence while speaking
                data_to_send = data
                if self.SEND_SILENCE_WHEN_SPEAKING and self.speaking:
                    # Quiet chunks (below the gate) are almost certainly speaker
                    # echo, so substitute silence; loud ones may be a barge-in.
                    gate = self.barge_in_threshold * self.SPEAKING_ENERGY_GATE
                    if energy < gate:
                        data_to_send = SILENCE_PCM
                # Send to Gemini
                b64 = base64.b64encode(data_to_send).decode()
                msg = {
                    "realtime_input": {
                        "media_chunks": [
                            {"mime_type": f"audio/pcm;rate={SEND_RATE}", "data": b64}
                        ]
                    }
                }
                await ws.send(json.dumps(msg))
        except Exception as e:
            if self._running:
                log.error("Mic error: %s", e)
        finally:
            stream.stop_stream()
            stream.close()
            pa.terminate()

    async def _receive_audio(self, ws):
        """Receive audio responses and transcriptions from Gemini."""
        async for msg in ws:
            if not self._running:
                break
            try:
                response = json.loads(msg)
                server_content = response.get("serverContent", {})
                # Gemini acknowledged our interruption — accept new audio again.
                if server_content.get("interrupted"):
                    self.interrupted = False
                # User transcription (partial/streaming). Several key spellings are
                # probed since the API has used different casings — TODO confirm
                # which one the current API version actually sends.
                input_tr = (
                    server_content.get("inputTranscription")
                    or server_content.get("input_transcription")
                    or server_content.get("inputAudioTranscription")
                    or server_content.get("input_audio_transcription")
                )
                if isinstance(input_tr, dict):
                    text = (input_tr.get("text") or "").strip()
                    now = time.time()
                    # Skip transcription inside the echo-guard window or while the
                    # robot itself is speaking (likely self-transcription).
                    if text and now >= self._ignore_input_until and not self.speaking:
                        # Buffer ASR text
                        if now - self._asr_last_time > self.ASR_WINDOW_SEC:
                            self._asr_buf = ""
                        self._asr_buf = text  # Gemini sends cumulative transcription
                        self._asr_last_time = now
                if self.interrupted:
                    continue
                # Audio from Gemini
                model_turn = server_content.get("modelTurn")
                if model_turn:
                    for part in model_turn.get("parts", []):
                        inline_data = part.get("inlineData")
                        if inline_data:
                            audio_b64 = inline_data.get("data")
                            if audio_b64:
                                now = time.time()
                                if not self.speaking:
                                    self._ai_speaking_since = now
                                    # Gemini started responding — fire transcript callback
                                    if self._asr_buf and self._on_transcript:
                                        self._on_transcript(self._asr_buf, "user")
                                self.speaking = True
                                self._last_ai_audio_time = now
                                self._ignore_input_until = now + self.ECHO_GUARD_SEC
                                audio_bytes = base64.b64decode(audio_b64)
                                await self._audio_queue.put(audio_bytes)
                        # Text from Gemini (thinking/response text)
                        text_part = part.get("text", "").strip()
                        if text_part and self._on_transcript:
                            self._on_transcript(text_part, "marcus")
                # Turn complete — Gemini finished speaking
                turn_complete = server_content.get("turnComplete")
                if turn_complete:
                    # Clear ASR buffer after turn
                    self._asr_buf = ""
            except Exception as e:
                log.error("Receive error: %s", e)

    async def _play_audio(self):
        """Collect Gemini audio chunks and play on G1 speaker."""
        while self._running:
            try:
                if not self.speaking:
                    await asyncio.sleep(0.05)
                    continue
                # Pre-buffer
                buffered = False
                while self.speaking and not buffered:
                    if self._audio_queue.qsize() >= self.PREBUFFER_CHUNKS:
                        buffered = True
                    else:
                        await asyncio.sleep(0.01)
                # Collect all audio chunks
                buffer_chunks = []
                while self.speaking:
                    try:
                        data = await asyncio.wait_for(
                            self._audio_queue.get(), timeout=self.PLAYBACK_TIMEOUT)
                        audio = np.frombuffer(data, dtype=np.int16)
                        buffer_chunks.append(audio)
                        self._last_ai_audio_time = time.time()
                    except asyncio.TimeoutError:
                        # Stop collecting once the queue has stayed empty for a while
                        # (no more audio arriving for this turn).
                        if self._audio_queue.empty():
                            if time.time() - self._last_ai_audio_time > 0.3:
                                break
                # Play on G1 speaker
                if buffer_chunks:
                    full_audio = np.concatenate(buffer_chunks)
                    duration = len(full_audio) / RECEIVE_RATE
                    log.info("Playing %.1fs on G1", duration)
                    # Playback blocks for the clip duration; run off the event loop.
                    await asyncio.get_event_loop().run_in_executor(
                        None, self._play_buffer_on_g1, full_audio)
                self.speaking = False
            except Exception as e:
                log.error("Play error: %s", e)
                self.speaking = False

    # ─── MAIN LOOP ────────────────────────────────────────
    async def _run_async(self):
        import websockets
        import inspect
        system_prompt = load_system_prompt()
        # Unmute mic
        subprocess.run(["pactl", "set-source-mute", self._mic_source, "0"], capture_output=True)
        subprocess.run(["pactl", "set-source-volume", self._mic_source, "100%"], capture_output=True)
        # Calibrate
        self._calibrate_mic()
        # websockets renamed extra_headers -> additional_headers in newer versions;
        # detect which keyword this installation supports.
        ws_kwargs = {"max_size": None}
        try:
            sig = inspect.signature(websockets.connect)
            if "extra_headers" in sig.parameters:
                ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
            else:
                ws_kwargs["additional_headers"] = {"Content-Type": "application/json"}
        except Exception:
            ws_kwargs["extra_headers"] = {"Content-Type": "application/json"}
        # Reconnect loop: any connection error triggers a retry after 3s.
        while self._running:
            try:
                log.info("Connecting to Gemini...")
                async with websockets.connect(URI, **ws_kwargs) as ws:
                    setup_msg = {
                        "setup": {
                            "model": MODEL,
                            "generationConfig": {
                                "responseModalities": ["AUDIO"],
                                "thinkingConfig": {"thinkingBudget": 0},
                                "speechConfig": {
                                    "voiceConfig": {
                                        "prebuiltVoiceConfig": {"voiceName": VOICE_NAME}
                                    }
                                },
                            },
                            "realtimeInputConfig": {
                                "automaticActivityDetection": {
                                    "startOfSpeechSensitivity": "START_SENSITIVITY_HIGH",
                                    "prefixPaddingMs": 40,
                                    "endOfSpeechSensitivity": "END_SENSITIVITY_HIGH",
                                    "silenceDurationMs": 250,
                                }
                            },
                            "inputAudioTranscription": {},
                            "systemInstruction": {"parts": [{"text": system_prompt}]},
                        }
                    }
                    await ws.send(json.dumps(setup_msg))
                    # Wait for the setup acknowledgement before streaming.
                    await ws.recv()
                    log.info("Connected! Always listening...")
                    self._audio_queue = asyncio.Queue()
                    await asyncio.gather(
                        self._capture_mic(ws),
                        self._receive_audio(ws),
                        self._play_audio(),
                    )
            except Exception as e:
                if self._running:
                    log.error("Connection error: %s — reconnecting in 3s", e)
                    await asyncio.sleep(3)

    def _voice_thread(self):
        # Dedicated thread running its own asyncio loop for the websocket session.
        asyncio.run(self._run_async())

    # ─── START / STOP ─────────────────────────────────────
    def start(self):
        """Start the voice module in a background daemon thread (idempotent)."""
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(target=self._voice_thread, daemon=True, name="gemini_voice")
        self._thread.start()
        log.info("Gemini voice module started")

    def stop(self):
        """Signal shutdown and wait (up to 5s) for the voice thread to exit."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Gemini voice module stopped")

    @property
    def is_running(self) -> bool:
        return self._running

    @property
    def state(self) -> str:
        return "LISTENING" if self._running else "STOPPED"

    @property
    def is_speaking(self) -> bool:
        return self.speaking
# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    import sys

    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def _print_transcript(text, role):
        """Echo each transcript line to stdout with its speaker role."""
        print(f" [{role.upper()}] {text}")

    g1_audio = AudioAPI()
    voice = GeminiVoiceModule(g1_audio, on_transcript=_print_transcript)
    print("Gemini voice v2 — speak anytime. Ctrl+C to stop.\n")
    voice.start()
    try:
        while voice.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")
    voice.stop()