#!/usr/bin/env python3
|
|
"""
|
|
API/audio_api.py — Marcus Audio API Layer
|
|
==========================================
|
|
Provides speak() and record_audio() for the Brain layer.
|
|
Brain imports ONLY from this API — never from unitree SDK directly.
|
|
|
|
Speaker: _CallRequestWithParamAndBin (single call, full buffer)
|
|
Mic: parec -d 3 (Hollyland wireless, PulseAudio source index from config)
|
|
TTS EN: Unitree built-in TtsMaker
|
|
TTS AR: Piper ar_JO-kareem-medium → resample → G1 speaker
|
|
|
|
Usage:
|
|
from API.audio_api import AudioAPI
|
|
audio = AudioAPI()
|
|
audio.speak("Hello", "en")
|
|
audio.speak("مرحبا", "ar")
|
|
recording = audio.record(seconds=5)
|
|
audio.play_pcm(recording)
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
import wave
|
|
import numpy as np
|
|
|
|
# ─── PATH CONFIG ─────────────────────────────────────────
from dotenv import load_dotenv

# Pull PROJECT_BASE (and any other overrides) from a .env file if present.
load_dotenv()

# Project layout: $PROJECT_BASE/Marcus/{Config,logs,Data,...}
BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)

# NOTE: directory creation and logging setup run at import time.
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

# Log to both logs/voice.log and the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("audio_api")
|
|
|
|
|
|
def load_config(name: str) -> dict:
    """Load Config/config_<name>.json from the project root.

    Args:
        name: Config suffix, e.g. "Voice" → Config/config_Voice.json.

    Returns:
        The parsed JSON as a dict.

    Raises:
        FileNotFoundError: if the config file is missing.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
    # Explicit UTF-8: config contains Arabic strings and must not depend
    # on the locale's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
|
# ─── AUDIO API CLASS ─────────────────────────────────────
|
|
|
|
class AudioAPI:
    """Marcus audio interface — speak + record + play.

    Speak pipeline: edge-tts → pydub resample to 16 kHz mono → G1 speaker.
    When edge-tts fails for English, falls back to the built-in TtsMaker.
    Piper synthesis is kept as an explicit alternative (speak_piper_en).
    Record pipeline: parec from the PulseAudio source named in config.
    """

    def __init__(self):
        """Load Config/config_Voice.json, bring up the SDK, prepare data dir."""
        self._config = load_config("Voice")
        self._client = None
        self._sdk_available = False
        self._init_sdk()

        # Config shortcuts
        self._tts = self._config["tts"]
        self._mic = self._config["mic"]
        self._spk = self._config["speaker"]
        self._target_rate = self._tts["target_sample_rate"]

        # Directory for saved recordings
        data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"])
        os.makedirs(data_dir, exist_ok=True)
        self._data_dir = data_dir

        # Speaking lock — prevents mic from hearing TTS output
        self._speaking = False
        self._speak_lock = threading.Lock()

        log.info(self._config["messages"]["ready"])

    def _init_sdk(self):
        """Initialize Unitree AudioClient over DDS; degrade gracefully on failure."""
        try:
            from unitree_sdk2py.core.channel import ChannelFactoryInitialize
            from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient

            dds_iface = self._config["speaker"]["dds_interface"]
            ChannelFactoryInitialize(0, dds_iface)

            self._client = AudioClient()
            self._client.SetTimeout(10.0)
            self._client.Init()
            self._client.SetVolume(self._config["speaker"]["volume"])
            self._sdk_available = True
            log.info("AudioClient initialized on %s", dds_iface)
        except Exception as e:
            # Missing SDK or no robot on the network — keep running with
            # playback disabled rather than crashing the Brain layer.
            log.error("AudioClient init failed: %s", e)
            self._sdk_available = False

    # ─── SPEAK ────────────────────────────────────────────

    def speak(self, text: str, lang: str = "auto"):
        """Speak text in the given language.

        Mutes mic during playback to prevent self-listening.
        lang="en"  → edge-tts (built-in TtsMaker fallback)
        lang="ar"  → edge-tts
        lang="auto" → detect from text (Arabic Unicode range check)
        """
        if lang == "auto":
            lang = self._detect_lang(text)

        log.info("[%s] speak: %s", lang.upper(), text[:80])

        with self._speak_lock:
            self._speaking = True
            self._mute_mic()

            try:
                if lang == "en":
                    self._speak_english(text)
                elif lang == "ar":
                    self._speak_arabic(text)
                else:
                    log.warning("Unknown lang '%s', falling back to English", lang)
                    self._speak_english(text)
            except Exception as e:
                log.error("%s: %s", self._config["messages"]["error_tts"], e)
            finally:
                # Small delay so speaker fully stops before mic reopens
                time.sleep(0.3)
                self._unmute_mic()
                self._speaking = False

    def _set_mic_mute(self, muted: bool):
        """Best-effort pactl mute toggle on the configured mic source."""
        # str() in case the config stores the source index as an int —
        # subprocess argv entries must be strings.
        source = str(self._mic["source_index"])
        subprocess.run(
            ["pactl", "set-source-mute", source, "1" if muted else "0"],
            capture_output=True,
        )

    def _mute_mic(self):
        """Mute the wireless mic to prevent self-listening."""
        self._set_mic_mute(True)
        log.debug("Mic muted")

    def _unmute_mic(self):
        """Unmute the wireless mic."""
        self._set_mic_mute(False)
        log.debug("Mic unmuted")

    @property
    def is_speaking(self) -> bool:
        """True while TTS is playing — voice module checks this."""
        return self._speaking

    def _speak_english(self, text: str):
        """English TTS via edge-tts (built-in TtsMaker as fallback)."""
        self._speak_edge_tts(text, "en")

    def _speak_arabic(self, text: str):
        """Arabic TTS via edge-tts."""
        self._speak_edge_tts(text, "ar")

    def speak_piper_en(self, text: str):
        """Alternative: English via Piper instead of edge-tts."""
        voice = self._tts["piper_voice_en"]
        audio, rate = self._piper_synthesize(text, voice)
        audio_16k = self._resample(audio, rate)
        self._play_pcm(audio_16k)

    # ─── PIPER TTS ────────────────────────────────────────

    def _piper_synthesize(self, text: str, voice: str) -> tuple:
        """Run Piper CLI, return (audio_int16, sample_rate).

        Raises:
            RuntimeError: if Piper exits non-zero.
            subprocess.TimeoutExpired: if Piper hangs past piper_timeout_sec.
        """
        cmd = ["piper", "--model", voice, "--output_raw"]
        timeout = self._tts["piper_timeout_sec"]

        proc = subprocess.run(
            cmd,
            input=text.encode("utf-8"),
            capture_output=True,
            timeout=timeout,
        )

        if proc.returncode != 0:
            stderr = proc.stderr.decode()[:300]
            raise RuntimeError(f"Piper failed: {stderr}")

        audio = np.frombuffer(proc.stdout, dtype=np.int16)
        piper_rate = self._tts["piper_sample_rate"]
        log.info("Piper: %d samples @ %dHz (%.1fs)", len(audio), piper_rate, len(audio) / piper_rate)
        return audio, piper_rate

    # ─── EDGE-TTS ─────────────────────────────────────────

    def _speak_edge_tts(self, text: str, lang: str):
        """Generate speech via edge-tts (subprocess) and play on G1.

        Falls back to the built-in TtsMaker (English only) when edge-tts
        fails or times out. Temp files are always cleaned up.
        """
        voice = "ar-AE-HamdanNeural" if lang == "ar" else "en-US-GuyNeural"
        ts = int(time.time() * 1000)
        mp3_path = f"/tmp/edge_{lang}_{ts}.mp3"
        wav_path = f"/tmp/edge_{lang}_{ts}.wav"

        # json.dumps produces valid Python string literals — it escapes
        # quotes, backslashes and newlines. The previous
        # text.replace('"', '\\"') did not, so text containing a backslash
        # or newline could break out of the generated snippet.
        code = (
            "import edge_tts, asyncio; "
            f"asyncio.run(edge_tts.Communicate({json.dumps(text)}, "
            f"voice={json.dumps(voice)}).save({json.dumps(mp3_path)}))"
        )
        try:
            result = subprocess.run(
                ["python3", "-c", code], capture_output=True, text=True, timeout=30
            )
        except subprocess.TimeoutExpired:
            # Network stall — don't let the exception escape past speak().
            log.error("edge-tts failed: %s", "timed out after 30s")
            self._builtin_tts_fallback(text, lang)
            return

        if result.returncode != 0:
            log.error("edge-tts failed: %s", result.stderr[:200])
            self._builtin_tts_fallback(text, lang)
            return

        try:
            from pydub import AudioSegment
            seg = AudioSegment.from_mp3(mp3_path)
            # G1 speaker expects 16 kHz mono s16le.
            seg = seg.set_frame_rate(16000).set_channels(1).set_sample_width(2)
            seg.export(wav_path, format="wav")

            with wave.open(wav_path, "rb") as wf:
                audio = np.frombuffer(wf.readframes(wf.getnframes()), dtype=np.int16)

            self._play_pcm(audio)
        except Exception as e:
            log.error("edge-tts conversion error: %s", e)
        finally:
            # Remove temp files whether or not conversion/playback succeeded.
            for path in (mp3_path, wav_path):
                try:
                    os.unlink(path)
                except OSError:
                    pass

    def _builtin_tts_fallback(self, text: str, lang: str):
        """Fallback to the robot's built-in TtsMaker (English only)."""
        if lang == "en" and self._sdk_available:
            self._client.TtsMaker(text, self._tts.get("builtin_speaker_id", 1))
            # TtsMaker plays asynchronously — wait roughly for the utterance.
            time.sleep(max(2.0, len(text) * 0.06))

    # ─── RESAMPLE ─────────────────────────────────────────

    def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray:
        """Linear-interpolation resample of int16 audio to the target rate (16kHz)."""
        if src_rate == self._target_rate:
            return audio
        tl = int(len(audio) * self._target_rate / src_rate)
        return np.interp(
            np.linspace(0, len(audio), tl, endpoint=False),
            np.arange(len(audio)),
            audio.astype(np.float64),
        ).astype(np.int16)

    # ─── G1 SPEAKER PLAYBACK ─────────────────────────────

    def _play_pcm(self, audio_16k: np.ndarray) -> float:
        """Play 16kHz mono int16 on G1 speaker. Returns duration in seconds.

        Blocks for the duration of playback (plus a small settle margin).
        Returns 0.0 when the SDK is unavailable.
        """
        if not self._sdk_available:
            log.warning("SDK not available, cannot play audio")
            return 0.0

        from unitree_sdk2py.g1.audio.g1_audio_api import (
            ROBOT_API_ID_AUDIO_START_PLAY,
            ROBOT_API_ID_AUDIO_STOP_PLAY,
        )

        app_name = self._spk["app_name"]

        # Stop any previous stream before starting a new one.
        self._client._Call(
            ROBOT_API_ID_AUDIO_STOP_PLAY,
            json.dumps({"app_name": app_name}),
        )
        time.sleep(0.3)

        # Build params — unique stream_id every call so the firmware treats
        # each playback as a fresh stream.
        pcm = audio_16k.tobytes()
        sid = f"s_{int(time.time() * 1000)}"
        param = json.dumps({
            "app_name": app_name,
            "stream_id": sid,
            "sample_rate": self._target_rate,
            "channels": 1,
            "bits_per_sample": 16,
        })

        # Single call — full buffer
        self._client._CallRequestWithParamAndBin(
            ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm)
        )

        duration = len(audio_16k) / self._target_rate
        time.sleep(duration + 0.5)

        self._client._Call(
            ROBOT_API_ID_AUDIO_STOP_PLAY,
            json.dumps({"app_name": app_name}),
        )

        log.info("Played: %.1fs", duration)
        return duration

    def play_pcm(self, audio_16k: np.ndarray) -> float:
        """Public wrapper for playing PCM audio."""
        return self._play_pcm(audio_16k)

    # ─── MIC RECORDING ───────────────────────────────────

    def record(self, seconds: float = 5.0) -> np.ndarray:
        """Record from Hollyland wireless mic via parec. Returns int16 array.

        Unmutes and full-volumes the source first (speak() may have muted it).
        """
        source = str(self._mic["source_index"])  # pactl/parec need str argv
        rate = str(self._mic["rate"])
        channels = str(self._mic["channels"])
        fmt = self._mic["format"]

        # Unmute mic and restore volume before capture
        subprocess.run(
            ["pactl", "set-source-mute", source, "0"],
            capture_output=True,
        )
        subprocess.run(
            ["pactl", "set-source-volume", source, "100%"],
            capture_output=True,
        )

        log.info("Recording %.1fs from mic source %s", seconds, source)

        proc = subprocess.Popen(
            ["parec", "-d", source,
             f"--format={fmt}", f"--rate={rate}", f"--channels={channels}", "--raw"],
            stdout=subprocess.PIPE,
        )
        time.sleep(seconds)
        proc.terminate()
        # communicate() drains the pipe fully and reaps the child —
        # a bare stdout.read() after terminate() can drop buffered audio
        # and leaves a zombie process.
        raw, _ = proc.communicate()

        audio = np.frombuffer(raw, dtype=np.int16)
        if audio.size == 0:
            # Guard: .std() on an empty array emits NaN/RuntimeWarning.
            log.warning(self._config["messages"]["error_mic"] + " — mic may be silent")
            return audio

        log.info("Recorded: %d samples, std=%.0f", len(audio), audio.std())

        if audio.std() < 50:
            log.warning(self._config["messages"]["error_mic"] + " — mic may be silent")

        return audio

    def save_recording(self, audio: np.ndarray, name: str) -> str:
        """Save recording to Data/Voice/Recordings/ as 16-bit mono WAV; return path."""
        path = os.path.join(self._data_dir, f"{name}.wav")
        # `with` finalizes the WAV header even if writeframes raises.
        with wave.open(path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(self._target_rate)
            wf.writeframes(audio.tobytes())
        log.info("Saved: %s", path)
        return path

    # ─── LANGUAGE DETECTION ───────────────────────────────

    @staticmethod
    def _detect_lang(text: str) -> str:
        """Detect language from text — Arabic Unicode range check."""
        for c in text:
            if '\u0600' <= c <= '\u06FF':
                return "ar"
        return "en"

    # ─── STATUS ───────────────────────────────────────────

    @property
    def is_available(self) -> bool:
        """True when the Unitree AudioClient initialized successfully."""
        return self._sdk_available
|
|
|
|
|
|
# ─── STANDALONE TEST ─────────────────────────────────────
|
|
|
|
if __name__ == "__main__":
|
|
import argparse
|
|
|
|
parser = argparse.ArgumentParser(description="Marcus Audio API Test")
|
|
parser.add_argument("--test", action="store_true", help="Run speak tests")
|
|
parser.add_argument("--speak", type=str, help="Speak this text")
|
|
parser.add_argument("--lang", default="auto", help="Language: en, ar, auto")
|
|
parser.add_argument("--record", type=float, default=0, help="Record N seconds")
|
|
args = parser.parse_args()
|
|
|
|
api = AudioAPI()
|
|
|
|
if args.test:
|
|
print("\n--- English built-in ---")
|
|
api.speak("Hello, I am Marcus.", "en")
|
|
time.sleep(1)
|
|
|
|
print("\n--- Arabic Piper ---")
|
|
api.speak("مرحبا، أنا ماركوس", "ar")
|
|
time.sleep(1)
|
|
|
|
print("\n--- Auto-detect ---")
|
|
api.speak("How are you?")
|
|
time.sleep(1)
|
|
api.speak("كيف حالك؟")
|
|
time.sleep(1)
|
|
|
|
print("\n--- Record 3s + playback ---")
|
|
rec = api.record(3.0)
|
|
if rec.std() > 50:
|
|
api.play_pcm(rec)
|
|
print("\nDone.")
|
|
|
|
elif args.speak:
|
|
api.speak(args.speak, args.lang)
|
|
|
|
elif args.record > 0:
|
|
rec = api.record(args.record)
|
|
api.save_recording(rec, f"test_{int(time.time())}")
|
|
if rec.std() > 50:
|
|
api.play_pcm(rec)
|
|
else:
|
|
parser.print_help()
|