Marcus/API/audio_api.py

398 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
API/audio_api.py — Marcus Audio API Layer
==========================================
Provides speak() and record() for the Brain layer.
Brain imports ONLY from this API — never from unitree SDK directly.
Speaker: Unitree built-in TtsMaker (G1 on-board engine, English only,
no MP3/WAV plumbing, no internet). Optional raw-PCM playback path
via _play_pcm() is kept for future modules that synthesize their
own audio (e.g. offline Piper).
Mic: G1 built-in mic (UDP multicast 239.168.123.161:5555, 16 kHz mono).
Legacy Hollyland/parec path retained as fallback when
config_Voice.json has mic.backend="pactl_parec".
TTS: English only. Arabic is rejected (the G1 firmware silently maps
Arabic to Chinese, which confuses everyone — if Arabic TTS is ever
needed again, use a separate offline backend like Piper).
Usage:
    from API.audio_api import AudioAPI
    audio = AudioAPI()
    audio.speak("Hello, I am Sanad")
    recording = audio.record(seconds=5)
    audio.play_pcm(recording)
"""
import json
import logging
import os
import subprocess
import sys
import threading
import time
import wave
from logging.handlers import RotatingFileHandler
import numpy as np
# ─── PATH + CONFIG ───────────────────────────────────────
# Use the canonical loaders from Core/ so path + config logic lives in one place.
# Make the project root importable so the Core/ loaders resolve even when this
# file is executed directly (python API/audio_api.py).
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)

from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config

LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

# logging.basicConfig does nothing when the root logger already has handlers
# (e.g. another module configured it first), so only build the handlers when
# they will actually be installed. Constructing the RotatingFileHandler
# unconditionally would open voice.log a second time and leave that handle
# unattached and never closed. Rotation caps voice.log at 5 MB × 3 backups.
if not logging.getLogger().handlers:
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
        handlers=[
            RotatingFileHandler(
                os.path.join(LOG_DIR, "voice.log"),
                maxBytes=5_000_000, backupCount=3, encoding="utf-8",
            ),
            logging.StreamHandler(),
        ],
    )

log = logging.getLogger("audio_api")
# ─── AUDIO API CLASS ─────────────────────────────────────
class AudioAPI:
    """Marcus audio interface — speak + record + play.

    Wraps the Unitree ``AudioClient`` (speaker and built-in TTS) and one of two
    microphone backends chosen by ``mic.backend`` in the "Voice" config:
    ``"builtin_udp"`` (default) or the legacy ``pactl``/``parec`` path.

    If the Unitree SDK cannot be initialized the instance is still created:
    ``is_available`` is False, ``speak()`` logs an error and returns, and
    ``play_pcm()`` returns 0.0. Recording does not depend on the SDK client.
    """

    # A capture whose sample standard deviation is below this is reported as silent.
    _SILENCE_STD = 50
    # Bytes per int16 PCM sample.
    _SAMPLE_BYTES = 2

    def __init__(self):
        """Load the "Voice" config and bring up the SDK client and TTS engine."""
        self._config = load_config("Voice")
        self._client = None
        self._sdk_available = False
        self._init_sdk()

        # Config shortcuts
        self._tts = self._config["tts"]
        self._mic = self._config["mic"]
        self._spk = self._config["speaker"]
        self._target_rate = self._tts.get("target_sample_rate", 16000)

        # Default mic backend: G1 built-in UDP multicast.
        # Set mic.backend="pactl_parec" in config_Voice.json to fall back
        # to the legacy Hollyland/PulseAudio path.
        self._mic_backend = self._mic.get("backend", "builtin_udp")
        self._builtin_mic = None  # lazy-initialized on first record()

        # Built-in TTS wrapper (uses the already-initialized AudioClient).
        # Keeps TTS synchronous so `is_speaking` is meaningful to the voice
        # loop that needs to skip mic input during playback.
        self._tts_engine = None
        if self._sdk_available:
            from Voice.builtin_tts import BuiltinTTS
            self._tts_engine = BuiltinTTS(
                self._client,
                default_speaker_id=self._tts.get("builtin_speaker_id", 0),
            )

        # Data dir (created on demand; save_recording() writes here)
        data_dir = os.path.join(PROJECT_ROOT, self._config["audio"]["data_dir"])
        os.makedirs(data_dir, exist_ok=True)
        self._data_dir = data_dir

        # Speaking flag + lock — serializes speak() calls and lets the voice
        # loop know when the robot is talking.
        self._speaking = False
        self._speak_lock = threading.Lock()

        log.info("%s (mic=%s, tts=%s)",
                 self._config["messages"]["ready"],
                 self._mic_backend,
                 "builtin_ttsmaker" if self._tts_engine else "disabled")

    def _init_sdk(self):
        """Initialize the Unitree AudioClient.

        Any failure (SDK not installed, DDS init error, missing config key)
        is logged and leaves ``_sdk_available`` False instead of raising.
        """
        try:
            from unitree_sdk2py.core.channel import ChannelFactoryInitialize
            from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
            dds_iface = self._config["speaker"]["dds_interface"]
            ChannelFactoryInitialize(0, dds_iface)
            self._client = AudioClient()
            self._client.SetTimeout(10.0)
            self._client.Init()
            self._client.SetVolume(self._config["speaker"]["volume"])
            self._sdk_available = True
            log.info("AudioClient initialized on %s", dds_iface)
        except Exception as e:
            log.error("AudioClient init failed: %s", e)
            self._sdk_available = False

    # ─── SPEAK ────────────────────────────────────────────
    def speak(self, text: str, lang: str = "en"):
        """Speak `text` in English through the G1 built-in TTS (TtsMaker).

        Suppresses the mic around playback (see `_mute_mic`) so the voice loop
        does not transcribe the robot's own voice, and blocks until the TTS
        call returns.

        Args:
            text: Text to speak; only the first 80 characters are logged.
            lang: Kept for API compatibility. Any value other than ``"en"``
                (or a falsy value) is rejected with a warning.

        TTS errors are logged, not raised. Errors from muting/unmuting the
        mic propagate, but `is_speaking` is always reset to False first.
        """
        if lang and lang != "en":
            log.warning("builtin_tts only supports English; got lang=%r — skipping", lang)
            return
        if self._tts_engine is None:
            log.error("No TTS engine initialized — audio SDK unavailable")
            return

        log.info("speak: %s", text[:80])
        with self._speak_lock:
            self._speaking = True
            try:
                self._mute_mic()
                try:
                    self._tts_engine.speak(text, block=True)
                except Exception as e:
                    log.error("%s: %s", self._config["messages"]["error_tts"], e)
                finally:
                    # Small tail so the speaker fully finishes before the mic
                    # is re-opened for capture.
                    time.sleep(0.2)
                    self._unmute_mic()
            finally:
                # Outer finally: a failure while muting/unmuting must never
                # leave the flag stuck at True, or the voice loop would
                # ignore the mic forever.
                self._speaking = False

    def _set_source_mute(self, muted: bool) -> None:
        """Mute/unmute the configured PulseAudio source via `pactl`."""
        # str(): the index may be an integer in the JSON config, and
        # subprocess only accepts str/bytes/path arguments.
        source = str(self._mic["source_index"])
        subprocess.run(
            ["pactl", "set-source-mute", source, "1" if muted else "0"],
            capture_output=True,
        )

    def _mute_mic(self) -> None:
        """Suppress mic input during TTS playback.

        For the UDP built-in mic, flush the buffer so we don't capture any
        echo that's already been queued. For the legacy PulseAudio path,
        actually mute the source.
        """
        if self._mic_backend == "builtin_udp":
            if self._builtin_mic is not None:
                self._builtin_mic.flush()
            return
        self._set_source_mute(True)
        log.debug("Mic muted")

    def _unmute_mic(self) -> None:
        """Re-enable mic capture after TTS playback.

        For the UDP built-in mic, flush the buffer again to drop whatever was
        received while the robot was speaking. For the legacy PulseAudio
        path, unmute the source.
        """
        if self._mic_backend == "builtin_udp":
            if self._builtin_mic is not None:
                self._builtin_mic.flush()
            return
        self._set_source_mute(False)
        log.debug("Mic unmuted")

    @property
    def is_speaking(self) -> bool:
        """True while TTS is playing — voice module checks this."""
        return self._speaking

    def _resample(self, audio: np.ndarray, src_rate: int) -> np.ndarray:
        """Linearly resample PCM from `src_rate` to `self._target_rate`.

        Returns `audio` itself when the rates already match, otherwise a new
        int16 array of ``int(len(audio) * target / src_rate)`` samples.
        Empty input yields an empty int16 array.
        """
        if src_rate == self._target_rate:
            return audio
        if len(audio) == 0:
            # np.interp raises ValueError on an empty set of sample points.
            return np.zeros(0, dtype=np.int16)
        out_len = int(len(audio) * self._target_rate / src_rate)
        return np.interp(
            np.linspace(0, len(audio), out_len, endpoint=False),
            np.arange(len(audio)),
            audio.astype(np.float64),
        ).astype(np.int16)

    # ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────
    def _play_pcm(self, audio_16k: np.ndarray) -> float:
        """Play mono int16 PCM on the G1 speaker and block until it finishes.

        The stream is declared to the robot as `self._target_rate` Hz, 1
        channel, 16 bits per sample; the caller must supply matching data.

        Returns:
            Duration in seconds (``len(audio_16k) / self._target_rate``), or
            0.0 when the SDK is unavailable.
        """
        if not self._sdk_available:
            log.warning("SDK not available, cannot play audio")
            return 0.0

        from unitree_sdk2py.g1.audio.g1_audio_api import (
            ROBOT_API_ID_AUDIO_START_PLAY,
            ROBOT_API_ID_AUDIO_STOP_PLAY,
        )
        app_name = self._spk["app_name"]
        stop_param = json.dumps({"app_name": app_name})

        # Stop previous stream
        self._client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, stop_param)
        time.sleep(0.3)

        # Build params — stream_id is derived from the millisecond clock so
        # each call gets a fresh id.
        pcm = audio_16k.tobytes()
        sid = f"s_{int(time.time() * 1000)}"
        param = json.dumps({
            "app_name": app_name,
            "stream_id": sid,
            "sample_rate": self._target_rate,
            "channels": 1,
            "bits_per_sample": 16,
        })

        # Single call — full buffer
        self._client._CallRequestWithParamAndBin(
            ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm)
        )

        # Wait out the clip (plus a margin) before stopping the stream.
        duration = len(audio_16k) / self._target_rate
        time.sleep(duration + 0.5)
        self._client._Call(ROBOT_API_ID_AUDIO_STOP_PLAY, stop_param)

        log.info("Played: %.1fs", duration)
        return duration

    def play_pcm(self, audio_16k: np.ndarray) -> float:
        """Public wrapper for playing PCM audio; see `_play_pcm`."""
        return self._play_pcm(audio_16k)

    # ─── MIC RECORDING ───────────────────────────────────
    def record(self, seconds: float = 5.0) -> np.ndarray:
        """Capture roughly `seconds` of audio as an int16 array.

        Default backend is the G1 built-in mic (UDP multicast). Set
        mic.backend="pactl_parec" in config_Voice.json to use the
        legacy Hollyland/parec path instead.
        """
        if self._mic_backend == "builtin_udp":
            return self._record_builtin(seconds)
        return self._record_parec(seconds)

    @staticmethod
    def _pcm_from_bytes(raw: bytes) -> np.ndarray:
        """View `raw` as int16 samples, dropping a trailing odd byte if any.

        np.frombuffer raises when the buffer is not a whole number of
        samples, which can happen if a capture is cut mid-sample.
        """
        usable = len(raw) - (len(raw) % AudioAPI._SAMPLE_BYTES)
        return np.frombuffer(raw[:usable], dtype=np.int16)

    def _report_capture(self, audio: np.ndarray, hint: str) -> None:
        """Log capture size/level and warn (with `hint`) when it looks silent."""
        # An empty capture counts as silent; std() of an empty array is NaN.
        level = float(audio.std()) if audio.size else 0.0
        log.info("Recorded: %d samples, std=%.0f", len(audio), level)
        if level < self._SILENCE_STD:
            log.warning("%s%s", self._config["messages"]["error_mic"], hint)

    def _record_builtin(self, seconds: float) -> np.ndarray:
        """Built-in mic path — join UDP multicast, read the requested duration."""
        if self._builtin_mic is None:
            from Voice.builtin_mic import BuiltinMic
            mcfg = self._config.get("mic_udp", {})
            mic = BuiltinMic(
                group=mcfg.get("group", "239.168.123.161"),
                port=mcfg.get("port", 5555),
                buf_max=mcfg.get("buffer_max_bytes", 64000),
            )
            mic.start()
            # Cache only after a successful start so a failed start is
            # retried on the next call instead of reusing a dead receiver.
            self._builtin_mic = mic
            time.sleep(0.2)  # let the receiver thread fill in

        log.info("Recording %.1fs from G1 built-in mic", seconds)
        raw = self._builtin_mic.read_seconds(seconds)
        audio = self._pcm_from_bytes(raw)
        self._report_capture(
            audio, " — G1 mic silent (check audio service on robot)")
        return audio

    def _record_parec(self, seconds: float) -> np.ndarray:
        """Legacy Hollyland/PulseAudio path — only used if mic.backend='pactl_parec'.

        Unmutes the source, sets its volume to 100%, then runs `parec` for
        `seconds` and returns whatever it wrote to stdout as int16 samples.
        """
        source = str(self._mic["source_index"])
        rate = str(self._mic["rate"])
        channels = str(self._mic["channels"])
        fmt = self._mic["format"]

        self._set_source_mute(False)
        subprocess.run(["pactl", "set-source-volume", source, "100%"],
                       capture_output=True)

        log.info("Recording %.1fs from mic source %s (parec)", seconds, source)
        raw = b""
        proc = subprocess.Popen(
            ["parec", "-d", source,
             f"--format={fmt}", f"--rate={rate}", f"--channels={channels}", "--raw"],
            stdout=subprocess.PIPE,
        )
        try:
            try:
                # communicate() keeps draining stdout while it waits. Simply
                # sleeping would let the OS pipe buffer fill up, stalling
                # parec and truncating longer recordings. parec runs until
                # it is stopped, so the timeout is the normal way out.
                raw, _ = proc.communicate(timeout=seconds)
            except subprocess.TimeoutExpired:
                proc.terminate()
                try:
                    # Retrying communicate() after a timeout loses no output.
                    raw, _ = proc.communicate(timeout=1.0)
                except subprocess.TimeoutExpired:
                    log.warning("parec cleanup error: %s",
                                "still running after SIGTERM, killing")
                    proc.kill()
                    raw, _ = proc.communicate()
        finally:
            # Always reap parec — an exception while waiting (Ctrl-C / signal)
            # would otherwise leave an orphaned recorder process running.
            if proc.poll() is None:
                try:
                    proc.kill()
                except OSError:
                    pass  # process already exited
                proc.wait()

        audio = self._pcm_from_bytes(raw or b"")
        self._report_capture(audio, " — mic may be silent")
        return audio

    def save_recording(self, audio: np.ndarray, name: str) -> str:
        """Write `audio` to ``<data_dir>/<name>.wav`` and return the path.

        The file is written as mono, 16-bit, at `self._target_rate` Hz;
        `audio` is assumed to already be int16 at that rate.
        """
        path = os.path.join(self._data_dir, f"{name}.wav")
        # Context manager closes the file even if a write fails.
        with wave.open(path, "wb") as wf:
            wf.setnchannels(1)
            wf.setsampwidth(self._SAMPLE_BYTES)
            wf.setframerate(self._target_rate)
            wf.writeframes(audio.tobytes())
        log.info("Saved: %s", path)
        return path

    # ─── STATUS ───────────────────────────────────────────
    @property
    def is_available(self) -> bool:
        """True when the Unitree AudioClient was initialized successfully."""
        return self._sdk_available
# ─── STANDALONE TEST ─────────────────────────────────────
if __name__ == "__main__":
    import argparse

    # Manual smoke test: exactly one of --test / --speak / --record is acted
    # on, checked in that order; with none of them the help text is printed.
    cli = argparse.ArgumentParser(description="Marcus Audio API Test")
    cli.add_argument("--test", action="store_true", help="Run TTS + record test")
    cli.add_argument("--speak", type=str, help="Speak this English text")
    cli.add_argument("--record", type=float, default=0, help="Record N seconds")
    opts = cli.parse_args()

    # The API is constructed up front, before the options are inspected.
    audio_api = AudioAPI()

    if opts.test:
        print("\n--- English (TtsMaker) ---")
        audio_api.speak("Hello, I am Sanad.")
        time.sleep(1)

        print("\n--- Record 3s + playback ---")
        clip = audio_api.record(3.0)
        # Only play back captures that are not near-silent.
        if clip.std() > 50:
            audio_api.play_pcm(clip)
        print("\nDone.")
    elif opts.speak:
        audio_api.speak(opts.speak)
    elif opts.record > 0:
        clip = audio_api.record(opts.record)
        # Saved under a timestamped name, then echoed back if audible.
        audio_api.save_recording(clip, f"test_{int(time.time())}")
        if clip.std() > 50:
            audio_api.play_pcm(clip)
    else:
        cli.print_help()