kassam 5d839d4f4e Voice: finalise on faster-whisper + energy wake, remove Vosk
Full-day voice-stack refactor. Experiments run and reverted:
- Gemini Live HTTP microservice (Python 3.8 env incompat, latency)
- Vosk grammar STT (English lexicon can't decode 'Sanad'; big model
  cold-load too slow on Jetson CPU)

Kept architecture:
- Voice/wake_detector.py — pure-numpy energy state machine with
  adaptive baseline, burst-audio capture for post-hoc verify.
- Voice/marcus_voice.py — orchestrator with 3 modes
  (wake_and_command / always_on / always_on_gated), hysteretic VAD,
  pre-silence trim (300 ms pre-roll), DSP pipeline (DC remove,
  80 Hz HPF, 0.97 pre-emphasis, peak-normalize), faster-whisper
  base.en int8 with beam=8 + temperature fallback [0,0.2,0.4],
  fuzzy-match canonicalisation, GARBAGE_PATTERNS + length filter,
  /s-/ phonetic wake-verify, full-turn debug WAV recording.
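The DSP pipeline listed above can be sketched in pure numpy (matching the project's no-scipy style). The stage order and the 80 Hz / 0.97 coefficients come from this commit message; the `preprocess` function name and the one-pole high-pass design are illustrative assumptions, not the actual marcus_voice.py code:

```python
import numpy as np

def preprocess(pcm: np.ndarray, rate: int = 16000) -> np.ndarray:
    """DC removal -> ~80 Hz HPF -> 0.97 pre-emphasis -> peak-normalize."""
    x = pcm.astype(np.float64)
    x -= x.mean()                                  # DC removal
    # One-pole high-pass at ~80 Hz (illustrative; the real filter may differ)
    rc = 1.0 / (2.0 * np.pi * 80.0)
    alpha = rc / (rc + 1.0 / rate)
    y = np.empty_like(x)
    y[0] = x[0]
    for n in range(1, len(x)):
        y[n] = alpha * (y[n - 1] + x[n] - x[n - 1])
    y = np.append(y[0], y[1:] - 0.97 * y[:-1])     # 0.97 pre-emphasis
    peak = np.abs(y).max()
    if peak > 0:
        y *= 32767.0 / peak                        # peak-normalize to int16 range
    return y.astype(np.int16)
```

The Python-loop filter is fine for short command utterances; a vectorized or scipy-based filter would be the obvious swap if throughput mattered.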

Config-driven vocab (zero hardcoded strings in Python):
- stt.wake_words (33 variants of 'Sanad')
- stt.command_vocab (68 canonical phrases)
- stt.garbage_patterns (17 Whisper noise outputs)
- stt.min_transcription_length, stt.command_vocab_cutoff
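The fuzzy-match canonicalisation these keys drive can be sketched with stdlib difflib. The key names follow the list above, but the vocab values and the `canonicalize` function are illustrative (the real vocab has 68 phrases and 17 garbage patterns):

```python
import difflib
from typing import Optional

# Illustrative fragment of config_Voice.json's stt section — values made up.
STT_CFG = {
    "command_vocab": ["go forward", "go back", "turn left", "turn right", "stop"],
    "command_vocab_cutoff": 0.6,
    "min_transcription_length": 3,
    "garbage_patterns": ["thank you", "thanks for watching"],
}

def canonicalize(transcript: str) -> Optional[str]:
    """Map a noisy Whisper transcript onto the canonical command vocab."""
    text = transcript.strip().lower()
    if len(text) < STT_CFG["min_transcription_length"]:
        return None  # shorter than stt.min_transcription_length
    if any(p in text for p in STT_CFG["garbage_patterns"]):
        return None  # known Whisper hallucination / noise output
    matches = difflib.get_close_matches(
        text, STT_CFG["command_vocab"],
        n=1, cutoff=STT_CFG["command_vocab_cutoff"])
    return matches[0] if matches else None
```

For example, `canonicalize("turn lift")` snaps to `"turn left"`, while garbage and too-short transcripts return `None` before the fuzzy step runs.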

Command parser widened (Brain/command_parser.py):
- _RE_SIMPLE_DIR — bare direction + verb+direction combos
  ('left', 'go back', 'move forward', 'step right', ...)
- _RE_STOP_SIMPLE — bare stop/halt/wait/pause/freeze/hold
- All motion constants sourced from config_Navigation.json
  (move_map + step_duration_sec) via API/zmq_api.py; no more
  hardcoded 0.3 / 2.0 magic numbers.
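The widened patterns might look like the sketch below. The commit doesn't show the actual regexes or the parser's return shape, so both are hedged reconstructions from the examples given above:

```python
import re

# Illustrative reconstructions — the real _RE_SIMPLE_DIR / _RE_STOP_SIMPLE
# in Brain/command_parser.py may differ.
_RE_SIMPLE_DIR = re.compile(
    r"^(?:(?:go|move|step|walk)\s+)?(forward|back(?:ward)?|left|right)$")
_RE_STOP_SIMPLE = re.compile(r"^(?:stop|halt|wait|pause|freeze|hold)$")

def parse(text: str):
    """Return (action, direction) or (None, None) if nothing matches."""
    text = text.strip().lower()
    if _RE_STOP_SIMPLE.match(text):
        return ("stop", None)
    m = _RE_SIMPLE_DIR.match(text)
    if m:
        d = m.group(1)
        return ("move", "backward" if d.startswith("back") else d)
    return (None, None)
```

This accepts both bare directions (`"left"`) and verb+direction combos (`"go back"`, `"step right"`), normalizing `back` to `backward`.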

API/audio_api.py — _play_pcm now uses AudioClient.PlayStream with
automatic resampling to 16 kHz (matches Sanad's proven pattern).

Removed:
- Voice/vosk_stt.py (and all Vosk references in marcus_voice.py)
- Models/vosk-model-small-en-us-0.15/ (40 MB model + zip)
- All Vosk keys from Config/config_Voice.json

Documentation synced across README, Doc/architecture.md,
Doc/pipeline.md, Doc/functions.md, Doc/controlling.md,
Doc/MARCUS_API.md, and the Doc/environment.md changelog.

Known limitation: faster-whisper base.en on Jetson CPU + G1
far-field mic yields ~50% command-transcription accuracy due
to model capacity and mic reverberation. Wake + ack + recording
+ trim + Whisper + fuzzy + brain + motion all verified working
end-to-end. Future improvement path (unused): close-talking USB
mic via pactl_parec, or Gemini Live via HTTP microservice.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-24 14:32:28 +04:00


#!/usr/bin/env python3
"""
API/audio_api.py — Marcus Audio API Layer
==========================================

Provides speak() and record() for the Brain layer.
Brain imports ONLY from this API — never from unitree SDK directly.

Speaker: Unitree built-in TtsMaker (G1 on-board engine, English only,
no MP3/WAV plumbing, no internet). Optional raw-PCM playback path
via _play_pcm() is kept for future modules that synthesize their
own audio (e.g. offline Piper).

Mic: G1 built-in mic (UDP multicast 239.168.123.161:5555, 16 kHz mono).
Legacy Hollyland/parec path retained as fallback when
config_Voice.json has mic.backend="pactl_parec".

TTS: English only. Arabic is rejected (the G1 firmware silently maps
Arabic to Chinese, which confuses everyone — if Arabic TTS is ever
needed again, use a separate offline backend like Piper).

Usage:
    from API.audio_api import AudioAPI

    audio = AudioAPI()
    audio.speak("Hello, I am Sanad")
    recording = audio.record(seconds=5)
    audio.play_pcm(recording)
"""
import json
import logging
import os
import subprocess
import sys
import threading
import time
import wave
from logging.handlers import RotatingFileHandler

import numpy as np

# ─── PATH + CONFIG ───────────────────────────────────────
# Use the canonical loaders from Core/ so path + config logic lives in one place.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)

from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config

LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

# All voice-subsystem logs go ONLY to logs/voice.log, not stdout — the
# terminal REPL needs a clean `Command:` prompt. Anything the operator
# needs to see is print()-ed explicitly from the callback sites.
# basicConfig is idempotent (no-op if marcus_voice installed handlers first).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        RotatingFileHandler(
            os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000, backupCount=3, encoding="utf-8",
        ),
    ],
)

log = logging.getLogger("audio_api")
# ─── AUDIO API CLASS ─────────────────────────────────────
class AudioAPI:
    """Marcus audio interface — speak + record + play."""

    def __init__(self):
        self._config = load_config("Voice")
        self._client = None
        self._sdk_available = False
        self._init_sdk()

        # Config shortcuts
        self._tts = self._config["tts"]
        self._mic = self._config["mic"]
        self._spk = self._config["speaker"]
        self._target_rate = self._tts.get("target_sample_rate", 16000)

        # Default mic backend: G1 built-in UDP multicast.
        # Set mic.backend="pactl_parec" in config_Voice.json to fall back
        # to the legacy Hollyland/PulseAudio path.
        self._mic_backend = self._mic.get("backend", "builtin_udp")
        self._builtin_mic = None  # lazy-initialized on first record()

        # Built-in TTS wrapper (uses the already-initialized AudioClient).
        # Keeps TTS synchronous so `is_speaking` is meaningful to the voice
        # loop that needs to skip mic input during playback.
        self._tts_engine = None
        if self._sdk_available:
            from Voice.builtin_tts import BuiltinTTS
            self._tts_engine = BuiltinTTS(
                self._client,
                default_speaker_id=self._tts.get("builtin_speaker_id", 0),
            )

        # Data dir
        data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"])
        os.makedirs(data_dir, exist_ok=True)
        self._data_dir = data_dir

        # Speaking lock — prevents mic from hearing TTS output
        self._speaking = False
        self._speak_lock = threading.Lock()

        log.info("%s (mic=%s, tts=%s)",
                 self._config["messages"]["ready"],
                 self._mic_backend,
                 "builtin_ttsmaker" if self._tts_engine else "disabled")

    def _init_sdk(self):
        """Initialize Unitree AudioClient."""
        try:
            from unitree_sdk2py.core.channel import ChannelFactoryInitialize
            from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient

            dds_iface = self._config["speaker"]["dds_interface"]
            ChannelFactoryInitialize(0, dds_iface)
            self._client = AudioClient()
            self._client.SetTimeout(10.0)
            self._client.Init()
            self._client.SetVolume(self._config["speaker"]["volume"])
            self._sdk_available = True
            log.info("AudioClient initialized on %s", dds_iface)
        except Exception as e:
            log.error("AudioClient init failed: %s", e)
            self._sdk_available = False
    # ─── SPEAK ────────────────────────────────────────────
    def speak(self, text: str, lang: str = "en"):
        """
        Speak `text` in English through the G1 built-in TTS (TtsMaker).

        Mutes (flushes) the mic during playback so the voice loop doesn't
        hear the robot's own voice and transcribe itself. `lang` is kept
        in the signature for API compatibility but only `"en"` is accepted
        — non-ASCII text (Arabic) is rejected by BuiltinTTS because the
        G1 firmware silently maps it to Chinese, which nobody wants.
        """
        if lang and lang != "en":
            log.warning("builtin_tts only supports English; got lang=%r — skipping", lang)
            return
        if self._tts_engine is None:
            log.error("No TTS engine initialized — audio SDK unavailable")
            return

        log.info("speak: %s", text[:80])
        with self._speak_lock:
            self._speaking = True
            self._mute_mic()
            try:
                self._tts_engine.speak(text, block=True)
            except Exception as e:
                log.error("%s: %s", self._config["messages"]["error_tts"], e)
            finally:
                # Small tail so the speaker fully finishes before the mic is
                # re-opened for capture
                time.sleep(0.2)
                self._unmute_mic()
                self._speaking = False

    def _mute_mic(self):
        """
        Suppress mic input during TTS playback.

        For the UDP built-in mic, flush the buffer so we don't capture any
        echo that's already been queued. For the legacy PulseAudio path,
        actually mute the source.
        """
        if self._mic_backend == "builtin_udp":
            if self._builtin_mic is not None:
                self._builtin_mic.flush()
            return
        source = self._mic["source_index"]
        subprocess.run(["pactl", "set-source-mute", source, "1"],
                       capture_output=True)
        log.debug("Mic muted")

    def _unmute_mic(self):
        """Re-enable mic after TTS playback (pactl path only)."""
        if self._mic_backend == "builtin_udp":
            if self._builtin_mic is not None:
                self._builtin_mic.flush()
            return
        source = self._mic["source_index"]
        subprocess.run(["pactl", "set-source-mute", source, "0"],
                       capture_output=True)
        log.debug("Mic unmuted")

    @property
    def is_speaking(self) -> bool:
        """True while TTS is playing — voice module checks this."""
        return self._speaking
    def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray:
        """Linear resample int16 PCM to self._target_rate (16 kHz)."""
        if src_rate == self._target_rate:
            return audio
        tl = int(len(audio) * self._target_rate / src_rate)
        return np.interp(
            np.linspace(0, len(audio), tl, endpoint=False),
            np.arange(len(audio)),
            audio.astype(np.float64),
        ).astype(np.int16)

    # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────
    def _play_pcm(self, audio: np.ndarray, rate: int = None) -> float:
        """
        Play mono int16 PCM on the G1 speaker.

        `rate` is the sample rate of the incoming `audio`; we always
        resample to self._target_rate (16 kHz) before sending because the
        G1 speaker hardware only honors that rate — if you hand it 24 kHz
        PCM, it plays ~1.5x too fast. This matches the Sanad pattern.

        Uses AudioClient.PlayStream (the high-level API) with a fresh
        stream_id + STOP_PLAY bracket on either side so a prior stream
        can't blend into this one.
        """
        if not self._sdk_available:
            log.warning("SDK not available, cannot play audio")
            return 0.0

        src_rate = int(rate) if rate else self._target_rate
        audio = self._resample(audio, src_rate)  # → self._target_rate
        if audio.size == 0:
            return 0.0

        from unitree_sdk2py.g1.audio.g1_audio_api import ROBOT_API_ID_AUDIO_STOP_PLAY

        app_name = self._spk["app_name"]

        # Stop any prior stream before opening a new one.
        self._client._Call(
            ROBOT_API_ID_AUDIO_STOP_PLAY,
            json.dumps({"app_name": app_name}),
        )
        time.sleep(0.15)

        sid = f"s_{int(time.time() * 1000)}"
        self._client.PlayStream(app_name, sid, audio.tobytes())

        duration = len(audio) / self._target_rate
        # Margin covers DDS buffer drain before STOP cuts playback short.
        time.sleep(duration + 0.3)

        self._client._Call(
            ROBOT_API_ID_AUDIO_STOP_PLAY,
            json.dumps({"app_name": app_name}),
        )
        log.info("Played: %.1fs (src=%d Hz → hw=%d Hz)",
                 duration, src_rate, self._target_rate)
        return duration

    def play_pcm(self, audio: np.ndarray, rate: int = None) -> float:
        """Public wrapper for playing PCM audio."""
        return self._play_pcm(audio, rate=rate)
    # ─── MIC RECORDING ───────────────────────────────────
    def record(self, seconds: float = 5.0) -> np.ndarray:
        """
        Capture `seconds` of int16 mono 16 kHz PCM.

        Default backend is the G1 built-in mic (UDP multicast). Set
        mic.backend="pactl_parec" in config_Voice.json to use the
        legacy Hollyland/parec path instead.
        """
        if self._mic_backend == "builtin_udp":
            return self._record_builtin(seconds)
        return self._record_parec(seconds)

    def _record_builtin(self, seconds: float) -> np.ndarray:
        """Built-in mic path — join UDP multicast, read the requested duration."""
        if self._builtin_mic is None:
            from Voice.builtin_mic import BuiltinMic

            mcfg = self._config.get("mic_udp", {})
            self._builtin_mic = BuiltinMic(
                group=mcfg.get("group", "239.168.123.161"),
                port=mcfg.get("port", 5555),
                buf_max=mcfg.get("buffer_max_bytes", 64000),
            )
            self._builtin_mic.start()
            time.sleep(0.2)  # let the receiver thread fill in

        log.info("Recording %.1fs from G1 built-in mic", seconds)
        raw = self._builtin_mic.read_seconds(seconds)
        audio = np.frombuffer(raw, dtype=np.int16)
        log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std())
        if audio.std() < 50:
            log.warning(self._config["messages"]["error_mic"] +
                        " — G1 mic silent (check audio service on robot)")
        return audio

    def _record_parec(self, seconds: float) -> np.ndarray:
        """Legacy Hollyland/PulseAudio path — only used if mic.backend='pactl_parec'."""
        source = self._mic["source_index"]
        rate = str(self._mic["rate"])
        channels = str(self._mic["channels"])
        fmt = self._mic["format"]

        subprocess.run(["pactl", "set-source-mute", source, "0"], capture_output=True)
        subprocess.run(["pactl", "set-source-volume", source, "100%"], capture_output=True)

        log.info("Recording %.1fs from mic source %s (parec)", seconds, source)
        proc = None
        raw = b""
        try:
            proc = subprocess.Popen(
                ["parec", "-d", source,
                 f"--format={fmt}", f"--rate={rate}", f"--channels={channels}", "--raw"],
                stdout=subprocess.PIPE,
            )
            time.sleep(seconds)
        finally:
            # Always kill parec — an exception in time.sleep (Ctrl-C / signal)
            # would otherwise leave an orphaned recorder process running.
            if proc is not None:
                try:
                    proc.terminate()
                    raw = proc.stdout.read()
                    proc.wait(timeout=1.0)
                except Exception as e:
                    log.warning("parec cleanup error: %s", e)
                    # Last-resort SIGKILL — suppress only OSError (process
                    # already exited) so we don't mask other bugs.
                    try:
                        proc.kill()
                    except OSError:
                        pass

        audio = np.frombuffer(raw, dtype=np.int16)
        log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std())
        if audio.std() < 50:
            log.warning(self._config["messages"]["error_mic"] + " — mic may be silent")
        return audio
    def save_recording(self, audio: np.ndarray, name: str) -> str:
        """Save recording to Data/Voice/Recordings/."""
        path = os.path.join(self._data_dir, f"{name}.wav")
        wf = wave.open(path, "wb")
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(self._target_rate)
        wf.writeframes(audio.tobytes())
        wf.close()
        log.info("Saved: %s", path)
        return path

    # ─── STATUS ───────────────────────────────────────────
    @property
    def is_available(self) -> bool:
        return self._sdk_available
# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Marcus Audio API Test")
    parser.add_argument("--test", action="store_true", help="Run TTS + record test")
    parser.add_argument("--speak", type=str, help="Speak this English text")
    parser.add_argument("--record", type=float, default=0, help="Record N seconds")
    args = parser.parse_args()

    api = AudioAPI()

    if args.test:
        print("\n--- English (TtsMaker) ---")
        api.speak("Hello, I am Sanad.")
        time.sleep(1)
        print("\n--- Record 3s + playback ---")
        rec = api.record(3.0)
        if rec.std() > 50:
            api.play_pcm(rec)
        print("\nDone.")
    elif args.speak:
        api.speak(args.speak)
    elif args.record > 0:
        rec = api.record(args.record)
        api.save_recording(rec, f"test_{int(time.time())}")
        if rec.std() > 50:
            api.play_pcm(rec)
    else:
        parser.print_help()