#!/usr/bin/env python3 """ API/audio_api.py — Marcus Audio API Layer ========================================== Provides speak() and record() for the Brain layer. Brain imports ONLY from this API — never from unitree SDK directly. Speaker: Unitree built-in TtsMaker (G1 on-board engine, English only, no MP3/WAV plumbing, no internet). Optional raw-PCM playback path via _play_pcm() is kept for future modules that synthesize their own audio (e.g. offline Piper). Mic: G1 built-in mic (UDP multicast 239.168.123.161:5555, 16 kHz mono). Legacy Hollyland/parec path retained as fallback when config_Voice.json has mic.backend="pactl_parec". TTS: English only. Arabic is rejected (the G1 firmware silently maps Arabic to Chinese, which confuses everyone — if Arabic TTS is ever needed again, use a separate offline backend like Piper). Usage: from API.audio_api import AudioAPI audio = AudioAPI() audio.speak("Hello, I am Sanad") recording = audio.record(seconds=5) audio.play_pcm(recording) """ import json import logging import os import subprocess import sys import threading import time import wave import numpy as np # ─── PATH + CONFIG ─────────────────────────────────────── # Use the canonical loaders from Core/ so path + config logic lives in one place. _PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if _PROJECT_DIR not in sys.path: sys.path.insert(0, _PROJECT_DIR) from Core.env_loader import PROJECT_ROOT from Core.config_loader import load_config LOG_DIR = os.path.join(PROJECT_ROOT, "logs") os.makedirs(LOG_DIR, exist_ok=True) # Note: logging.basicConfig() only takes effect on the first call per process. # If the voice module already configured logging (common path via run_marcus.py), # this call is a no-op. When audio_api is used standalone, it wires logs to # logs/voice.log + stderr. 
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("audio_api")


# ─── AUDIO API CLASS ─────────────────────────────────────

class AudioAPI:
    """Marcus audio interface — speak + record + play."""

    def __init__(self):
        self._config = load_config("Voice")
        self._client = None
        self._sdk_available = False
        self._init_sdk()

        # Config shortcuts
        self._tts = self._config["tts"]
        self._mic = self._config["mic"]
        self._spk = self._config["speaker"]
        self._target_rate = self._tts.get("target_sample_rate", 16000)

        # Default mic backend: G1 built-in UDP multicast.
        # Set mic.backend="pactl_parec" in config_Voice.json to fall back
        # to the legacy Hollyland/PulseAudio path.
        self._mic_backend = self._mic.get("backend", "builtin_udp")
        self._builtin_mic = None  # lazy-initialized on first record()

        # Built-in TTS wrapper (uses the already-initialized AudioClient).
        # Keeps TTS synchronous so `is_speaking` is meaningful to the voice
        # loop that needs to skip mic input during playback.
        self._tts_engine = None
        if self._sdk_available:
            from Voice.builtin_tts import BuiltinTTS
            self._tts_engine = BuiltinTTS(
                self._client,
                default_speaker_id=self._tts.get("builtin_speaker_id", 0),
            )

        # Data dir
        data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"])
        os.makedirs(data_dir, exist_ok=True)
        self._data_dir = data_dir

        # Speaking lock — prevents mic from hearing TTS output
        self._speaking = False
        self._speak_lock = threading.Lock()

        log.info("%s (mic=%s, tts=%s)", self._config["messages"]["ready"],
                 self._mic_backend,
                 "builtin_ttsmaker" if self._tts_engine else "disabled")

    def _init_sdk(self):
        """Initialize Unitree AudioClient.

        On any failure (SDK not installed, DDS interface down) the API stays
        usable in degraded mode: `_sdk_available` is False, speak()/play are
        no-ops with error logs, and record() still works (UDP mic needs no SDK).
        """
        try:
            from unitree_sdk2py.core.channel import ChannelFactoryInitialize
            from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient

            dds_iface = self._config["speaker"]["dds_interface"]
            ChannelFactoryInitialize(0, dds_iface)
            self._client = AudioClient()
            self._client.SetTimeout(10.0)
            self._client.Init()
            self._client.SetVolume(self._config["speaker"]["volume"])
            self._sdk_available = True
            log.info("AudioClient initialized on %s", dds_iface)
        except Exception as e:
            log.error("AudioClient init failed: %s", e)
            self._sdk_available = False

    # ─── SPEAK ────────────────────────────────────────────

    def speak(self, text: str, lang: str = "en"):
        """
        Speak `text` in English through the G1 built-in TTS (TtsMaker).

        Mutes (flushes) the mic during playback so the voice loop doesn't
        hear the robot's own voice and transcribe itself.

        `lang` is kept in the signature for API compatibility but only
        `"en"` is accepted — non-ASCII text (Arabic) is rejected by
        BuiltinTTS because the G1 firmware silently maps it to Chinese,
        which nobody wants.
        """
        if lang and lang != "en":
            log.warning("builtin_tts only supports English; got lang=%r — skipping", lang)
            return
        if self._tts_engine is None:
            log.error("No TTS engine initialized — audio SDK unavailable")
            return

        log.info("speak: %s", text[:80])
        with self._speak_lock:
            self._speaking = True
            # FIX: _mute_mic() is inside the try — if it raises (e.g. bad
            # mic config on the pactl path), the finally block still clears
            # `_speaking` and restores the mic; previously the flag stuck True.
            try:
                self._mute_mic()
                self._tts_engine.speak(text, block=True)
            except Exception as e:
                log.error("%s: %s", self._config["messages"]["error_tts"], e)
            finally:
                # Small tail so the speaker fully finishes before the mic is
                # re-opened for capture
                time.sleep(0.2)
                self._unmute_mic()
                self._speaking = False

    def _mute_mic(self):
        """
        Suppress mic input during TTS playback.

        For the UDP built-in mic, flush the buffer so we don't capture any
        echo that's already been queued. For the legacy PulseAudio path,
        actually mute the source.
        """
        if self._mic_backend == "builtin_udp":
            if self._builtin_mic is not None:
                self._builtin_mic.flush()
            return
        source = self._mic["source_index"]
        subprocess.run(["pactl", "set-source-mute", source, "1"],
                       capture_output=True)
        log.debug("Mic muted")

    def _unmute_mic(self):
        """Re-enable mic after TTS playback (pactl path only)."""
        if self._mic_backend == "builtin_udp":
            if self._builtin_mic is not None:
                self._builtin_mic.flush()
            return
        source = self._mic["source_index"]
        subprocess.run(["pactl", "set-source-mute", source, "0"],
                       capture_output=True)
        log.debug("Mic unmuted")

    @property
    def is_speaking(self) -> bool:
        """True while TTS is playing — voice module checks this."""
        return self._speaking

    def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray:
        """Linear resample int16 PCM to self._target_rate (16 kHz)."""
        if src_rate == self._target_rate:
            return audio
        tl = int(len(audio) * self._target_rate / src_rate)
        return np.interp(
            np.linspace(0, len(audio), tl, endpoint=False),
            np.arange(len(audio)),
            audio.astype(np.float64),
        ).astype(np.int16)

    # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────

    def _play_pcm(self, audio_16k: np.ndarray) -> float:
        """Play 16kHz mono int16 on G1 speaker. Returns duration."""
        if not self._sdk_available:
            log.warning("SDK not available, cannot play audio")
            return 0.0

        from unitree_sdk2py.g1.audio.g1_audio_api import (
            ROBOT_API_ID_AUDIO_START_PLAY,
            ROBOT_API_ID_AUDIO_STOP_PLAY,
        )

        app_name = self._spk["app_name"]

        # Stop previous stream
        self._client._Call(
            ROBOT_API_ID_AUDIO_STOP_PLAY,
            json.dumps({"app_name": app_name}),
        )
        time.sleep(0.3)

        # Build params — unique stream_id every call
        pcm = audio_16k.tobytes()
        sid = f"s_{int(time.time() * 1000)}"
        param = json.dumps({
            "app_name": app_name,
            "stream_id": sid,
            "sample_rate": self._target_rate,
            "channels": 1,
            "bits_per_sample": 16,
        })

        # Single call — full buffer
        self._client._CallRequestWithParamAndBin(
            ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm)
        )

        # Block until the stream should be done, then close it explicitly.
        duration = len(audio_16k) / self._target_rate
        time.sleep(duration + 0.5)
        self._client._Call(
            ROBOT_API_ID_AUDIO_STOP_PLAY,
            json.dumps({"app_name": app_name}),
        )
        log.info("Played: %.1fs", duration)
        return duration

    def play_pcm(self, audio_16k: np.ndarray) -> float:
        """Public wrapper for playing PCM audio."""
        return self._play_pcm(audio_16k)

    # ─── MIC RECORDING ───────────────────────────────────

    def record(self, seconds: float = 5.0) -> np.ndarray:
        """
        Capture `seconds` of int16 mono 16 kHz PCM.

        Default backend is the G1 built-in mic (UDP multicast). Set
        mic.backend="pactl_parec" in config_Voice.json to use the legacy
        Hollyland/parec path instead.
        """
        if self._mic_backend == "builtin_udp":
            return self._record_builtin(seconds)
        return self._record_parec(seconds)

    @staticmethod
    def _audio_level(audio: np.ndarray) -> float:
        """Std-dev of the capture, 0.0 for empty buffers.

        FIX: np.std() of an empty array is NaN (plus a RuntimeWarning), and
        `nan < 50` is False — so an empty capture previously skipped the
        "mic silent" warning, the one case that most deserves it.
        """
        return float(audio.std()) if audio.size else 0.0

    def _record_builtin(self, seconds: float) -> np.ndarray:
        """Built-in mic path — join UDP multicast, read the requested duration."""
        if self._builtin_mic is None:
            from Voice.builtin_mic import BuiltinMic
            mcfg = self._config.get("mic_udp", {})
            self._builtin_mic = BuiltinMic(
                group=mcfg.get("group", "239.168.123.161"),
                port=mcfg.get("port", 5555),
                buf_max=mcfg.get("buffer_max_bytes", 64000),
            )
            self._builtin_mic.start()
            time.sleep(0.2)  # let the receiver thread fill in

        log.info("Recording %.1fs from G1 built-in mic", seconds)
        raw = self._builtin_mic.read_seconds(seconds)
        audio = np.frombuffer(raw, dtype=np.int16)
        level = self._audio_level(audio)
        log.info("Recorded: %d samples, std=%.0f", len(audio), level)
        if level < 50:
            log.warning(self._config["messages"]["error_mic"]
                        + " — G1 mic silent (check audio service on robot)")
        return audio

    def _record_parec(self, seconds: float) -> np.ndarray:
        """Legacy Hollyland/PulseAudio path — only used if mic.backend='pactl_parec'."""
        source = self._mic["source_index"]
        rate = str(self._mic["rate"])
        channels = str(self._mic["channels"])
        fmt = self._mic["format"]

        subprocess.run(["pactl", "set-source-mute", source, "0"],
                       capture_output=True)
        subprocess.run(["pactl", "set-source-volume", source, "100%"],
                       capture_output=True)

        log.info("Recording %.1fs from mic source %s (parec)", seconds, source)
        proc = subprocess.Popen(
            ["parec", "-d", source, f"--format={fmt}", f"--rate={rate}",
             f"--channels={channels}", "--raw"],
            stdout=subprocess.PIPE,
        )
        time.sleep(seconds)
        proc.terminate()
        # FIX: communicate() drains whatever parec buffered and reaps the
        # child. The old `proc.stdout.read()` never wait()ed, leaving a
        # zombie process per recording.
        raw, _ = proc.communicate()

        audio = np.frombuffer(raw, dtype=np.int16)
        level = self._audio_level(audio)
        log.info("Recorded: %d samples, std=%.0f", len(audio), level)
        if level < 50:
            log.warning(self._config["messages"]["error_mic"]
                        + " — mic may be silent")
        return audio

    def save_recording(self, audio: np.ndarray, name: str) -> str:
        """Save recording to Data/Voice/Recordings/."""
        path = os.path.join(self._data_dir, f"{name}.wav")
        # FIX: context manager — the old open/close pair leaked the handle
        # (and left a corrupt header) if any write raised.
        with wave.open(path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(self._target_rate)
            wf.writeframes(audio.tobytes())
        log.info("Saved: %s", path)
        return path

    # ─── STATUS ───────────────────────────────────────────

    @property
    def is_available(self) -> bool:
        """True when the Unitree AudioClient initialized successfully."""
        return self._sdk_available


# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Marcus Audio API Test")
    parser.add_argument("--test", action="store_true", help="Run TTS + record test")
    parser.add_argument("--speak", type=str, help="Speak this English text")
    parser.add_argument("--record", type=float, default=0, help="Record N seconds")
    args = parser.parse_args()

    api = AudioAPI()

    if args.test:
        print("\n--- English (TtsMaker) ---")
        api.speak("Hello, I am Sanad.")
        time.sleep(1)
        print("\n--- Record 3s + playback ---")
        rec = api.record(3.0)
        if rec.std() > 50:
            api.play_pcm(rec)
        print("\nDone.")
    elif args.speak:
        api.speak(args.speak)
    elif args.record > 0:
        rec = api.record(args.record)
        api.save_recording(rec, f"test_{int(time.time())}")
        if rec.std() > 50:
            api.play_pcm(rec)
    else:
        parser.print_help()