#!/usr/bin/env python3
"""
API/audio_api.py — Marcus Audio API Layer
==========================================
Provides speak() and record() for the Brain layer.
Brain imports ONLY from this API — never from unitree SDK directly.

Speaker: Unitree built-in TtsMaker (G1 on-board engine, English only,
    no MP3/WAV plumbing, no internet). Optional raw-PCM playback path
    via _play_pcm() is kept for future modules that synthesize their
    own audio (e.g. offline Piper).
Mic: G1 built-in mic (UDP multicast 239.168.123.161:5555, 16 kHz mono).
    Legacy Hollyland/parec path retained as fallback when
    config_Voice.json has mic.backend="pactl_parec".
TTS: English only. Arabic is rejected (the G1 firmware silently maps
    Arabic to Chinese, which confuses everyone — if Arabic TTS is ever
    needed again, use a separate offline backend like Piper).

Usage:
    from API.audio_api import AudioAPI
    audio = AudioAPI()
    audio.speak("Hello, I am Sanad")
    recording = audio.record(seconds=5)
    audio.play_pcm(recording)
"""
import json
import logging
import os
import subprocess
import sys
import threading
import time
import wave
from logging.handlers import RotatingFileHandler

import numpy as np

# ─── PATH + CONFIG ───────────────────────────────────────
# Path and config handling are delegated to the canonical Core/ helpers so
# that logic lives in exactly one place; we only make sure the project root
# is importable first.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)
from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config

LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

# logging.basicConfig is idempotent per process: if marcus_voice configured
# the root logger first, this call is a no-op and both modules share the same
# RotatingFileHandler (stdlib FileHandlers hold an internal lock, so concurrent
# writes to voice.log are safe). Rotation caps voice.log at 5 MB × 3 backups.
_file_handler = RotatingFileHandler(
    os.path.join(LOG_DIR, "voice.log"),
    maxBytes=5_000_000,
    backupCount=3,
    encoding="utf-8",
)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[_file_handler, logging.StreamHandler()],
)
log = logging.getLogger("audio_api")
|
||
|
||
|
||
# ─── AUDIO API CLASS ─────────────────────────────────────


class AudioAPI:
    """Marcus audio interface — speak + record + play.

    Facade over the Unitree G1 audio stack so the Brain layer never touches
    the SDK directly:

    * ``speak()``    — English TTS via the on-board TtsMaker engine.
    * ``record()``   — int16 mono 16 kHz capture (built-in UDP mic by
                       default, legacy PulseAudio/parec as fallback).
    * ``play_pcm()`` — raw 16 kHz PCM playback on the G1 speaker.

    If the SDK fails to initialize, the object stays usable and each
    hardware-dependent method degrades to a logged no-op.
    """

    def __init__(self):
        """Load config_Voice.json, bring up the SDK, and wire the backends."""
        self._config = load_config("Voice")
        self._client = None          # Unitree AudioClient, set by _init_sdk()
        self._sdk_available = False
        self._init_sdk()

        # Config shortcuts
        self._tts = self._config["tts"]
        self._mic = self._config["mic"]
        self._spk = self._config["speaker"]
        self._target_rate = self._tts.get("target_sample_rate", 16000)

        # Default mic backend: G1 built-in UDP multicast.
        # Set mic.backend="pactl_parec" in config_Voice.json to fall back
        # to the legacy Hollyland/PulseAudio path.
        self._mic_backend = self._mic.get("backend", "builtin_udp")
        self._builtin_mic = None  # lazy-initialized on first record()

        # Built-in TTS wrapper (uses the already-initialized AudioClient).
        # Keeps TTS synchronous so `is_speaking` is meaningful to the voice
        # loop that needs to skip mic input during playback.
        self._tts_engine = None
        if self._sdk_available:
            from Voice.builtin_tts import BuiltinTTS
            self._tts_engine = BuiltinTTS(
                self._client,
                default_speaker_id=self._tts.get("builtin_speaker_id", 0),
            )

        # Data dir for saved recordings
        data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"])
        os.makedirs(data_dir, exist_ok=True)
        self._data_dir = data_dir

        # Speaking lock — prevents mic from hearing TTS output
        self._speaking = False
        self._speak_lock = threading.Lock()

        log.info("%s (mic=%s, tts=%s)",
                 self._config["messages"]["ready"],
                 self._mic_backend,
                 "builtin_ttsmaker" if self._tts_engine else "disabled")

    def _init_sdk(self):
        """Initialize Unitree AudioClient.

        On any failure (SDK not installed, no robot on the DDS interface)
        the error is logged and ``_sdk_available`` stays False — the broad
        catch is deliberate so construction never crashes off-robot.
        """
        try:
            from unitree_sdk2py.core.channel import ChannelFactoryInitialize
            from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient

            dds_iface = self._config["speaker"]["dds_interface"]
            ChannelFactoryInitialize(0, dds_iface)

            self._client = AudioClient()
            self._client.SetTimeout(10.0)
            self._client.Init()
            self._client.SetVolume(self._config["speaker"]["volume"])
            self._sdk_available = True
            log.info("AudioClient initialized on %s", dds_iface)
        except Exception as e:
            log.error("AudioClient init failed: %s", e)
            self._sdk_available = False

    # ─── SPEAK ────────────────────────────────────────────

    def speak(self, text: str, lang: str = "en"):
        """
        Speak `text` in English through the G1 built-in TTS (TtsMaker).

        Mutes (flushes) the mic during playback so the voice loop doesn't
        hear the robot's own voice and transcribe itself. `lang` is kept
        in the signature for API compatibility but only `"en"` is accepted
        — non-ASCII text (Arabic) is rejected by BuiltinTTS because the
        G1 firmware silently maps it to Chinese, which nobody wants.
        """
        if lang and lang != "en":
            log.warning("builtin_tts only supports English; got lang=%r — skipping", lang)
            return
        if self._tts_engine is None:
            log.error("No TTS engine initialized — audio SDK unavailable")
            return

        log.info("speak: %s", text[:80])

        with self._speak_lock:
            self._speaking = True
            self._mute_mic()
            try:
                self._tts_engine.speak(text, block=True)
            except Exception as e:
                log.error("%s: %s", self._config["messages"]["error_tts"], e)
            finally:
                # Small tail so the speaker fully finishes before the mic is
                # re-opened for capture
                time.sleep(0.2)
                self._unmute_mic()
                self._speaking = False

    def _mute_mic(self):
        """
        Suppress mic input during TTS playback.
        For the UDP built-in mic, flush the buffer so we don't capture any
        echo that's already been queued. For the legacy PulseAudio path,
        actually mute the source.
        """
        if self._mic_backend == "builtin_udp":
            if self._builtin_mic is not None:
                self._builtin_mic.flush()
            return
        # str() guards against an integer source_index in the JSON config —
        # subprocess argv entries must be strings.
        source = str(self._mic["source_index"])
        subprocess.run(["pactl", "set-source-mute", source, "1"],
                       capture_output=True)
        log.debug("Mic muted")

    def _unmute_mic(self):
        """Re-enable mic after TTS playback (pactl path only)."""
        if self._mic_backend == "builtin_udp":
            if self._builtin_mic is not None:
                self._builtin_mic.flush()
            return
        source = str(self._mic["source_index"])
        subprocess.run(["pactl", "set-source-mute", source, "0"],
                       capture_output=True)
        log.debug("Mic unmuted")

    @property
    def is_speaking(self) -> bool:
        """True while TTS is playing — voice module checks this."""
        return self._speaking

    def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray:
        """Linear resample int16 PCM to self._target_rate (16 kHz).

        Returns the input unchanged when rates already match; otherwise
        interpolates onto a grid of the proportional target length.
        """
        if src_rate == self._target_rate:
            return audio
        tl = int(len(audio) * self._target_rate / src_rate)
        return np.interp(
            np.linspace(0, len(audio), tl, endpoint=False),
            np.arange(len(audio)),
            audio.astype(np.float64),
        ).astype(np.int16)

    # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────

    def _play_pcm(self, audio_16k: np.ndarray) -> float:
        """Play 16kHz mono int16 on G1 speaker. Returns duration in seconds.

        Uses the SDK's private _Call/_CallRequestWithParamAndBin entry points
        (no public raw-PCM API exists). Blocks for the clip duration + 0.5 s
        tail before stopping the stream.
        """
        if not self._sdk_available:
            log.warning("SDK not available, cannot play audio")
            return 0.0

        from unitree_sdk2py.g1.audio.g1_audio_api import (
            ROBOT_API_ID_AUDIO_START_PLAY,
            ROBOT_API_ID_AUDIO_STOP_PLAY,
        )

        app_name = self._spk["app_name"]

        # Stop previous stream
        self._client._Call(
            ROBOT_API_ID_AUDIO_STOP_PLAY,
            json.dumps({"app_name": app_name}),
        )
        time.sleep(0.3)

        # Build params — unique stream_id every call
        pcm = audio_16k.tobytes()
        sid = f"s_{int(time.time() * 1000)}"
        param = json.dumps({
            "app_name": app_name,
            "stream_id": sid,
            "sample_rate": self._target_rate,
            "channels": 1,
            "bits_per_sample": 16,
        })

        # Single call — full buffer
        self._client._CallRequestWithParamAndBin(
            ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm)
        )

        duration = len(audio_16k) / self._target_rate
        time.sleep(duration + 0.5)

        self._client._Call(
            ROBOT_API_ID_AUDIO_STOP_PLAY,
            json.dumps({"app_name": app_name}),
        )

        log.info("Played: %.1fs", duration)
        return duration

    def play_pcm(self, audio_16k: np.ndarray) -> float:
        """Public wrapper for playing PCM audio. Returns duration in seconds."""
        return self._play_pcm(audio_16k)

    # ─── MIC RECORDING ───────────────────────────────────

    def record(self, seconds: float = 5.0) -> np.ndarray:
        """
        Capture `seconds` of int16 mono 16 kHz PCM.

        Default backend is the G1 built-in mic (UDP multicast). Set
        mic.backend="pactl_parec" in config_Voice.json to use the
        legacy Hollyland/parec path instead.
        """
        if self._mic_backend == "builtin_udp":
            return self._record_builtin(seconds)
        return self._record_parec(seconds)

    def _record_builtin(self, seconds: float) -> np.ndarray:
        """Built-in mic path — join UDP multicast, read the requested duration."""
        if self._builtin_mic is None:
            from Voice.builtin_mic import BuiltinMic
            mcfg = self._config.get("mic_udp", {})
            self._builtin_mic = BuiltinMic(
                group=mcfg.get("group", "239.168.123.161"),
                port=mcfg.get("port", 5555),
                buf_max=mcfg.get("buffer_max_bytes", 64000),
            )
            self._builtin_mic.start()
            time.sleep(0.2)  # let the receiver thread fill in

        log.info("Recording %.1fs from G1 built-in mic", seconds)
        raw = self._builtin_mic.read_seconds(seconds)
        audio = np.frombuffer(raw, dtype=np.int16)
        # Guard the empty-capture case: std() of a zero-length array is NaN
        # (with a RuntimeWarning) and NaN < 50 is False, which used to skip
        # the silence warning exactly when nothing was captured.
        level = float(audio.std()) if audio.size else 0.0
        log.info("Recorded: %d samples, std=%.0f", len(audio), level)
        if level < 50:
            log.warning(self._config["messages"]["error_mic"] +
                        " — G1 mic silent (check audio service on robot)")
        return audio

    def _record_parec(self, seconds: float) -> np.ndarray:
        """Legacy Hollyland/PulseAudio path — only used if mic.backend='pactl_parec'."""
        source = str(self._mic["source_index"])  # argv entries must be str
        rate = str(self._mic["rate"])
        channels = str(self._mic["channels"])
        fmt = self._mic["format"]

        subprocess.run(["pactl", "set-source-mute", source, "0"], capture_output=True)
        subprocess.run(["pactl", "set-source-volume", source, "100%"], capture_output=True)

        log.info("Recording %.1fs from mic source %s (parec)", seconds, source)
        proc = None
        raw = b""
        try:
            proc = subprocess.Popen(
                ["parec", "-d", source,
                 f"--format={fmt}", f"--rate={rate}", f"--channels={channels}", "--raw"],
                stdout=subprocess.PIPE,
            )
            time.sleep(seconds)
        finally:
            # Always kill parec — an exception in time.sleep (Ctrl-C / signal)
            # would otherwise leave an orphaned recorder process running.
            if proc is not None:
                try:
                    proc.terminate()
                    raw = proc.stdout.read()
                    proc.wait(timeout=1.0)
                except Exception as e:
                    log.warning("parec cleanup error: %s", e)
                    # Last-resort SIGKILL — suppress only OSError (process
                    # already exited) so we don't mask other bugs.
                    try:
                        proc.kill()
                    except OSError:
                        pass

        audio = np.frombuffer(raw, dtype=np.int16)
        # Same empty-capture guard as _record_builtin: avoid NaN std() and
        # make sure a zero-sample capture still triggers the silence warning.
        level = float(audio.std()) if audio.size else 0.0
        log.info("Recorded: %d samples, std=%.0f", len(audio), level)
        if level < 50:
            log.warning(self._config["messages"]["error_mic"] + " — mic may be silent")
        return audio

    def save_recording(self, audio: np.ndarray, name: str) -> str:
        """Save recording to Data/Voice/Recordings/ as <name>.wav; return path.

        Mono, 16-bit, at self._target_rate. The context manager finalizes
        the WAV header and releases the handle even if a write fails.
        """
        path = os.path.join(self._data_dir, f"{name}.wav")
        with wave.open(path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(self._target_rate)
            wf.writeframes(audio.tobytes())
        log.info("Saved: %s", path)
        return path

    # ─── STATUS ───────────────────────────────────────────

    @property
    def is_available(self) -> bool:
        """True when the Unitree AudioClient initialized successfully."""
        return self._sdk_available
|
||
|
||
|
||
# ─── STANDALONE TEST ─────────────────────────────────────

if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="Marcus Audio API Test")
    cli.add_argument("--test", action="store_true", help="Run TTS + record test")
    cli.add_argument("--speak", type=str, help="Speak this English text")
    cli.add_argument("--record", type=float, default=0, help="Record N seconds")
    opts = cli.parse_args()

    audio_api = AudioAPI()

    if opts.test:
        # Full round-trip: TTS, then a short capture, then echo playback.
        print("\n--- English (TtsMaker) ---")
        audio_api.speak("Hello, I am Sanad.")
        time.sleep(1)

        print("\n--- Record 3s + playback ---")
        clip = audio_api.record(3.0)
        if clip.std() > 50:  # only replay when the mic actually heard something
            audio_api.play_pcm(clip)
        print("\nDone.")
    elif opts.speak:
        audio_api.speak(opts.speak)
    elif opts.record > 0:
        clip = audio_api.record(opts.record)
        audio_api.save_recording(clip, f"test_{int(time.time())}")
        if clip.std() > 50:
            audio_api.play_pcm(clip)
    else:
        cli.print_help()