#!/usr/bin/env python3
"""
API/audio_api.py — Marcus Audio API Layer
=========================================

Provides speak(), record() and play_pcm() for the Brain layer.
Brain imports ONLY from this API — never from the unitree SDK directly.

Speaker:  AudioClient._CallRequestWithParamAndBin (single call, full buffer)
Mic:      parec -d <source> (Hollyland wireless; PulseAudio source index
          comes from Config/config_Voice.json)
TTS EN:   edge-tts "en-US-GuyNeural"; falls back to the Unitree built-in
          TtsMaker when edge-tts fails and the SDK is available
TTS AR:   edge-tts "ar-AE-HamdanNeural" → 16 kHz mono PCM → G1 speaker
Piper:    optional English path via speak_piper_en()

Usage:
    from API.audio_api import AudioAPI
    audio = AudioAPI()
    audio.speak("Hello", "en")
    audio.speak("مرحبا", "ar")
    recording = audio.record(seconds=5)
    audio.play_pcm(recording)
"""

import json
import logging
import os
import subprocess
import threading
import time
import wave

import numpy as np

# ─── PATH CONFIG ─────────────────────────────────────────
from dotenv import load_dotenv

load_dotenv()

BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("audio_api")


def load_config(name: str) -> dict:
    """Load Config/config_<name>.json from the project root."""
    path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
    with open(path, "r") as f:
        return json.load(f)


# ─── AUDIO API CLASS ─────────────────────────────────────
class AudioAPI:
    """Marcus audio interface — speak + record + play."""

    def __init__(self):
        self._config = load_config("Voice")
        self._client = None
        self._sdk_available = False
        self._init_sdk()

        # Config shortcuts
        self._tts = self._config["tts"]
        self._mic = self._config["mic"]
        self._spk = self._config["speaker"]
        self._target_rate = self._tts["target_sample_rate"]

        # Data dir for saved recordings
        data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"])
        os.makedirs(data_dir, exist_ok=True)
        self._data_dir = data_dir

        # Speaking lock — prevents mic from hearing TTS output
        self._speaking = False
        self._speak_lock = threading.Lock()

        log.info(self._config["messages"]["ready"])

    def _init_sdk(self):
        """Initialize the Unitree AudioClient over DDS; degrade gracefully."""
        try:
            from unitree_sdk2py.core.channel import ChannelFactoryInitialize
            from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient

            dds_iface = self._config["speaker"]["dds_interface"]
            ChannelFactoryInitialize(0, dds_iface)
            self._client = AudioClient()
            self._client.SetTimeout(10.0)
            self._client.Init()
            self._client.SetVolume(self._config["speaker"]["volume"])
            self._sdk_available = True
            log.info("AudioClient initialized on %s", dds_iface)
        except Exception as e:
            # Keep running without the speaker; speak()/play_pcm() will warn.
            log.error("AudioClient init failed: %s", e)
            self._sdk_available = False

    # ─── SPEAK ──────────────────────────────────────────
    def speak(self, text: str, lang: str = "auto"):
        """
        Speak text in the given language.
        Mutes the mic during playback to prevent self-listening.

        lang="en"   → edge-tts English (built-in TtsMaker fallback)
        lang="ar"   → edge-tts Arabic
        lang="auto" → detect from text
        """
        if lang == "auto":
            lang = self._detect_lang(text)
        log.info("[%s] speak: %s", lang.upper(), text[:80])

        with self._speak_lock:
            self._speaking = True
            self._mute_mic()
            try:
                if lang == "en":
                    self._speak_english(text)
                elif lang == "ar":
                    self._speak_arabic(text)
                else:
                    log.warning("Unknown lang '%s', falling back to English", lang)
                    self._speak_english(text)
            except Exception as e:
                log.error("%s: %s", self._config["messages"]["error_tts"], e)
            finally:
                # Small delay so speaker fully stops before mic reopens
                time.sleep(0.3)
                self._unmute_mic()
                self._speaking = False

    def _mute_mic(self):
        """Mute the wireless mic to prevent self-listening."""
        # str(): JSON configs may store the PulseAudio source index as an int,
        # and subprocess argv entries must be strings.
        source = str(self._mic["source_index"])
        subprocess.run(
            ["pactl", "set-source-mute", source, "1"],
            capture_output=True,
        )
        log.debug("Mic muted")

    def _unmute_mic(self):
        """Unmute the wireless mic."""
        source = str(self._mic["source_index"])
        subprocess.run(
            ["pactl", "set-source-mute", source, "0"],
            capture_output=True,
        )
        log.debug("Mic unmuted")

    @property
    def is_speaking(self) -> bool:
        """True while TTS is playing — voice module checks this."""
        return self._speaking

    def _speak_english(self, text: str):
        """English TTS via edge-tts."""
        self._speak_edge_tts(text, "en")

    def _speak_arabic(self, text: str):
        """Arabic TTS via edge-tts."""
        self._speak_edge_tts(text, "ar")

    def speak_piper_en(self, text: str):
        """Alternative: English via Piper instead of edge-tts."""
        voice = self._tts["piper_voice_en"]
        audio, rate = self._piper_synthesize(text, voice)
        audio_16k = self._resample(audio, rate)
        self._play_pcm(audio_16k)

    # ─── PIPER TTS ──────────────────────────────────────
    def _piper_synthesize(self, text: str, voice: str) -> tuple:
        """Run the Piper CLI; return (audio_int16, sample_rate).

        Raises RuntimeError when Piper exits non-zero, and
        subprocess.TimeoutExpired when synthesis exceeds the config timeout.
        """
        cmd = ["piper", "--model", voice, "--output_raw"]
        timeout = self._tts["piper_timeout_sec"]
        proc = subprocess.run(
            cmd,
            input=text.encode("utf-8"),
            capture_output=True,
            timeout=timeout,
        )
        if proc.returncode != 0:
            stderr = proc.stderr.decode()[:300]
            raise RuntimeError(f"Piper failed: {stderr}")
        audio = np.frombuffer(proc.stdout, dtype=np.int16)
        piper_rate = self._tts["piper_sample_rate"]
        log.info("Piper: %d samples @ %dHz (%.1fs)",
                 len(audio), piper_rate, len(audio) / piper_rate)
        return audio, piper_rate

    # ─── EDGE TTS ───────────────────────────────────────
    def _speak_edge_tts(self, text: str, lang: str):
        """Generate speech via edge-tts in a child interpreter and play on G1.

        Text/voice/path are passed as argv to the child process — the old
        approach interpolated them into the ``-c`` source string, which broke
        on backslashes, newlines and unescaped quotes.
        """
        voice = "ar-AE-HamdanNeural" if lang == "ar" else "en-US-GuyNeural"
        ts = int(time.time() * 1000)
        mp3_path = f"/tmp/edge_{lang}_{ts}.mp3"
        wav_path = f"/tmp/edge_{lang}_{ts}.wav"

        code = (
            "import sys, asyncio, edge_tts\n"
            "text, voice, path = sys.argv[1], sys.argv[2], sys.argv[3]\n"
            "asyncio.run(edge_tts.Communicate(text, voice=voice).save(path))\n"
        )
        result = subprocess.run(
            ["python3", "-c", code, text, voice, mp3_path],
            capture_output=True, text=True, timeout=30,
        )
        if result.returncode != 0:
            log.error("edge-tts failed: %s", result.stderr[:200])
            # Offline fallback for English: the robot's built-in TTS.
            if lang == "en" and self._sdk_available:
                self._client.TtsMaker(text, self._tts.get("builtin_speaker_id", 1))
                # TtsMaker plays asynchronously on the robot; rough wait
                time.sleep(max(2.0, len(text) * 0.06))
            return

        try:
            from pydub import AudioSegment

            seg = AudioSegment.from_mp3(mp3_path)
            seg = seg.set_frame_rate(16000).set_channels(1).set_sample_width(2)
            seg.export(wav_path, format="wav")
            with wave.open(wav_path, "rb") as wf:
                audio = np.frombuffer(wf.readframes(wf.getnframes()),
                                      dtype=np.int16)
            self._play_pcm(audio)
        except Exception as e:
            log.error("edge-tts conversion error: %s", e)
        finally:
            # Always remove temp files, whether playback succeeded or not
            for path in (mp3_path, wav_path):
                try:
                    os.unlink(path)
                except OSError:
                    pass

    # ─── RESAMPLE ───────────────────────────────────────
    def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray:
        """Linearly resample int16 audio to the target rate (16 kHz)."""
        if src_rate == self._target_rate:
            return audio
        tl = int(len(audio) * self._target_rate / src_rate)
        return np.interp(
            np.linspace(0, len(audio), tl, endpoint=False),
            np.arange(len(audio)),
            audio.astype(np.float64),
        ).astype(np.int16)

    # ─── G1 SPEAKER PLAYBACK ───────────────────────────
    def _play_pcm(self, audio_16k: np.ndarray) -> float:
        """Play 16 kHz mono int16 PCM on the G1 speaker. Returns duration (s)."""
        if not self._sdk_available:
            log.warning("SDK not available, cannot play audio")
            return 0.0

        from unitree_sdk2py.g1.audio.g1_audio_api import (
            ROBOT_API_ID_AUDIO_START_PLAY,
            ROBOT_API_ID_AUDIO_STOP_PLAY,
        )

        app_name = self._spk["app_name"]

        # Stop any previous stream before starting a new one
        self._client._Call(
            ROBOT_API_ID_AUDIO_STOP_PLAY,
            json.dumps({"app_name": app_name}),
        )
        time.sleep(0.3)

        # Build params — unique stream_id every call
        pcm = audio_16k.tobytes()
        sid = f"s_{int(time.time() * 1000)}"
        param = json.dumps({
            "app_name": app_name,
            "stream_id": sid,
            "sample_rate": self._target_rate,
            "channels": 1,
            "bits_per_sample": 16,
        })

        # Single call — full buffer
        self._client._CallRequestWithParamAndBin(
            ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm)
        )

        duration = len(audio_16k) / self._target_rate
        time.sleep(duration + 0.5)  # block until playback finishes

        self._client._Call(
            ROBOT_API_ID_AUDIO_STOP_PLAY,
            json.dumps({"app_name": app_name}),
        )
        log.info("Played: %.1fs", duration)
        return duration

    def play_pcm(self, audio_16k: np.ndarray) -> float:
        """Public wrapper for playing PCM audio."""
        return self._play_pcm(audio_16k)

    # ─── MIC RECORDING ─────────────────────────────────
    def record(self, seconds: float = 5.0) -> np.ndarray:
        """Record from the Hollyland wireless mic via parec.

        Returns an int16 numpy array (empty when nothing was captured).
        """
        source = str(self._mic["source_index"])
        rate = str(self._mic["rate"])
        channels = str(self._mic["channels"])
        fmt = self._mic["format"]

        # Make sure the mic is live and at full volume before capturing
        subprocess.run(
            ["pactl", "set-source-mute", source, "0"],
            capture_output=True,
        )
        subprocess.run(
            ["pactl", "set-source-volume", source, "100%"],
            capture_output=True,
        )

        log.info("Recording %.1fs from mic source %s", seconds, source)
        proc = subprocess.Popen(
            ["parec", "-d", source, f"--format={fmt}",
             f"--rate={rate}", f"--channels={channels}", "--raw"],
            stdout=subprocess.PIPE,
        )
        time.sleep(seconds)
        proc.terminate()
        # communicate() drains the pipe AND reaps the child — the previous
        # stdout.read() after terminate() could leave a zombie process.
        try:
            raw, _ = proc.communicate(timeout=5.0)
        except subprocess.TimeoutExpired:
            proc.kill()
            raw, _ = proc.communicate()

        audio = np.frombuffer(raw, dtype=np.int16)
        # Guard: std() of an empty array is NaN and emits a RuntimeWarning
        level = float(audio.std()) if audio.size else 0.0
        log.info("Recorded: %d samples, std=%.0f", len(audio), level)
        if level < 50:
            log.warning(self._config["messages"]["error_mic"] + " — mic may be silent")
        return audio

    def save_recording(self, audio: np.ndarray, name: str) -> str:
        """Save a recording to Data/Voice/Recordings/<name>.wav; return path."""
        path = os.path.join(self._data_dir, f"{name}.wav")
        # Context manager — the old code leaked the handle on write errors
        with wave.open(path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)  # int16
            wf.setframerate(self._target_rate)
            wf.writeframes(audio.tobytes())
        log.info("Saved: %s", path)
        return path

    # ─── LANGUAGE DETECTION ─────────────────────────────
    @staticmethod
    def _detect_lang(text: str) -> str:
        """Detect language from text — Arabic Unicode block check."""
        if any('\u0600' <= c <= '\u06FF' for c in text):
            return "ar"
        return "en"

    # ─── STATUS ─────────────────────────────────────────
    @property
    def is_available(self) -> bool:
        """True when the Unitree AudioClient initialized successfully."""
        return self._sdk_available


# ─── STANDALONE TEST ───────────────────────────────────
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Marcus Audio API Test")
    parser.add_argument("--test", action="store_true", help="Run speak tests")
    parser.add_argument("--speak", type=str, help="Speak this text")
    parser.add_argument("--lang", default="auto", help="Language: en, ar, auto")
    parser.add_argument("--record", type=float, default=0, help="Record N seconds")
    args = parser.parse_args()

    api = AudioAPI()

    if args.test:
        print("\n--- English built-in ---")
        api.speak("Hello, I am Marcus.", "en")
        time.sleep(1)
        print("\n--- Arabic Piper ---")
        api.speak("مرحبا، أنا ماركوس", "ar")
        time.sleep(1)
        print("\n--- Auto-detect ---")
        api.speak("How are you?")
        time.sleep(1)
        api.speak("كيف حالك؟")
        time.sleep(1)
        print("\n--- Record 3s + playback ---")
        rec = api.record(3.0)
        if rec.std() > 50:
            api.play_pcm(rec)
        print("\nDone.")
    elif args.speak:
        api.speak(args.speak, args.lang)
    elif args.record > 0:
        rec = api.record(args.record)
        api.save_recording(rec, f"test_{int(time.time())}")
        if rec.std() > 50:
            api.play_pcm(rec)
    else:
        parser.print_help()