Marcus/Voice/audio_io.py

346 lines
11 KiB
Python

"""Hardware-agnostic audio I/O for Marcus voice pipelines.
Direct port of /home/zedx/Robotics_workspace/yslootahtech/Project/Sanad/voice/audio_io.py,
with USB mic/speaker profiles (Anker/Hollyland) removed — Marcus only uses the
G1 on-board profile. Class names and method signatures match Sanad verbatim so
the rest of the Gemini brain code lifts over cleanly.
Mics deliver int16 mono PCM at 16 kHz.
Speakers accept int16 mono PCM plus a `source_rate` and resample internally.
Usage:
    audio = AudioIO.from_profile("builtin", audio_client=ac)
    audio.start()
    try:
        chunk = audio.mic.read_chunk(1024)
        audio.speaker.begin_stream()
        audio.speaker.send_chunk(pcm_24k, 24000)
        audio.speaker.wait_finish()
    finally:
        audio.stop()
"""
from __future__ import annotations
import json
import logging
import os
import socket
import struct
import subprocess
import sys
import threading
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Optional, Union
import numpy as np
# Put the directory two levels above this file on sys.path so that the
# `Core.config_loader` import below can resolve.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)

# Created before the config is loaded so that a load failure can be reported.
log = logging.getLogger("audio_io")

try:
    from Core.config_loader import load_config
    _VCFG = load_config("Voice") or {}
except Exception as exc:
    # Best-effort: the module stays importable without the project config, but
    # the fallback is reported instead of being silently swallowed.
    log.warning("Voice config unavailable (%s); using built-in audio defaults", exc)
    _VCFG = {}

# Per-device config sections; a missing or null section becomes an empty dict.
_MIC_CFG = _VCFG.get("mic_udp", {}) or {}
_SP_CFG = _VCFG.get("speaker", {}) or {}

# Sample rate (Hz) that every Mic implementation reports.
TARGET_MIC_RATE = 16_000

# Multicast group/port the built-in mic stream is joined on.
_MCAST_GRP = _MIC_CFG.get("group", "239.168.123.161")
_MCAST_PORT = int(_MIC_CFG.get("port", 5555))
# Upper bound (bytes) of the mic receive buffer; older bytes are dropped.
_MIC_BUF_MAX = int(_MIC_CFG.get("buffer_max_bytes", 64_000))
# How long (seconds) a mic read waits for a full chunk before padding.
_MIC_READ_TIMEOUT = float(_MIC_CFG.get("read_timeout_sec", 0.04))

# Anything accepted as PCM input: raw byte buffers or a numpy array.
PCMLike = Union[bytes, bytearray, memoryview, np.ndarray]
def _find_g1_local_ip() -> str:
    """Return this host's IPv4 address on the G1's 192.168.123.0/24 network.

    Runs ``ip -4 -o addr`` and returns the first whitespace-separated token
    that starts with ``192.168.123.``, with any ``/prefix`` suffix removed.

    Raises:
        RuntimeError: if no such token appears in the command output.
    """
    listing = subprocess.run(
        ["ip", "-4", "-o", "addr"], capture_output=True, text=True,
    ).stdout
    # Lazily scan every token of every line, in output order.
    g1_tokens = (
        token
        for row in listing.splitlines()
        for token in row.split()
        if token.startswith("192.168.123.")
    )
    first_match = next(g1_tokens, None)
    if first_match is None:
        raise RuntimeError("no 192.168.123.x interface found")
    return first_match.split("/")[0]
def _resample_int16(pcm: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
    """Linearly resample `pcm` from `src_rate` to `dst_rate`, returning int16.

    When the rates are equal or `pcm` is empty, the input is only cast to
    int16 (without copying if it already is int16). Otherwise the output has
    ``max(1, int(len(pcm) * dst_rate / src_rate))`` samples obtained by linear
    interpolation; positions past the last input sample take its value.
    """
    needs_resampling = src_rate != dst_rate and pcm.size != 0
    if not needs_resampling:
        return pcm.astype(np.int16, copy=False)

    n_in = len(pcm)
    n_out = max(1, int(n_in * dst_rate / src_rate))
    # Fractional input-sample positions at which each output sample is taken.
    query_positions = np.linspace(0, n_in, n_out, endpoint=False)
    input_positions = np.arange(n_in)
    interpolated = np.interp(
        query_positions, input_positions, pcm.astype(np.float64),
    )
    return interpolated.astype(np.int16)
def _as_int16_array(pcm: PCMLike) -> np.ndarray:
    """Return `pcm` as a 1-D int16 numpy array.

    A numpy array is cast to int16 (without copying if it already is int16).
    Any other buffer is copied to `bytes` and reinterpreted as native-endian
    int16 samples. A dangling trailing byte (odd-length buffer) cannot form a
    sample and is dropped rather than letting `np.frombuffer` raise
    ValueError in the middle of a playback stream.
    """
    if isinstance(pcm, np.ndarray):
        return pcm.astype(np.int16, copy=False)
    raw = bytes(pcm)
    # Keep only whole 2-byte samples.
    usable = len(raw) - (len(raw) % 2)
    return np.frombuffer(raw[:usable], dtype=np.int16)
# ─── Protocols ────────────────────────────────────────────
class Mic(ABC):
    """Abstract microphone interface.

    Concrete mics implement the four lifecycle/read methods below and expose
    their output rate through `sample_rate`.
    """

    # Output sample rate in Hz reported by the mic.
    sample_rate: int = TARGET_MIC_RATE

    @abstractmethod
    def start(self) -> None:
        """Begin capturing audio."""

    @abstractmethod
    def read_chunk(self, num_bytes: int) -> bytes:
        """Return captured PCM for a request of `num_bytes` bytes."""

    @abstractmethod
    def flush(self) -> None:
        """Discard any audio that has been captured but not yet read."""

    @abstractmethod
    def stop(self) -> None:
        """Stop capturing and release the mic's resources."""
class Speaker(ABC):
    """Abstract speaker interface for streamed PCM playback."""

    @abstractmethod
    def begin_stream(self) -> None:
        """Prepare for a new playback stream."""

    @abstractmethod
    def send_chunk(self, pcm: PCMLike, source_rate: int) -> None:
        """Queue PCM for playback. `source_rate` is the sample rate of `pcm`."""

    @abstractmethod
    def wait_finish(self) -> None:
        """Block until the queued audio is considered finished."""

    @abstractmethod
    def stop(self) -> None:
        """Stop playback."""

    @property
    @abstractmethod
    def interrupted(self) -> bool:
        """Whether playback has been interrupted."""

    @property
    def total_sent_sec(self) -> float:
        """Seconds of audio sent so far; this base implementation reports 0.0."""
        return 0.0
# ─── G1 built-in (UDP mic + AudioClient speaker) ──────────
class BuiltinMic(Mic):
    """G1 robot's on-board mic published over UDP multicast.

    `start()` joins the multicast group and spawns a daemon thread that
    appends every received datagram to a bounded byte buffer (the oldest
    bytes are dropped once the buffer exceeds `buf_max`). `read_chunk()`
    drains that buffer.
    """

    sample_rate = TARGET_MIC_RATE

    # Socket receive timeout in seconds; bounds how long the receive thread
    # needs to notice that `stop()` was requested.
    _RECV_TIMEOUT_SEC = 1.0

    def __init__(self, group: str = _MCAST_GRP, port: int = _MCAST_PORT,
                 buf_max: int = _MIC_BUF_MAX):
        """Store the multicast endpoint and buffer limit; no I/O happens here.

        Args:
            group: IPv4 multicast group to join.
            port: UDP port to bind.
            buf_max: maximum number of bytes kept in the receive buffer.
        """
        self._group = group
        self._port = port
        self._buf_max = buf_max
        self._sock = None  # type: Optional[socket.socket]
        self._buf = bytearray()
        self._lock = threading.Lock()  # guards self._buf
        self._running = False
        self._thread = None  # type: Optional[threading.Thread]

    def start(self) -> None:
        """Join the multicast group and start the receive thread.

        Does nothing if the mic is already running.

        Raises:
            RuntimeError: propagated from `_find_g1_local_ip` when no G1
                network interface is found.
            OSError: if the socket cannot be bound or joined to the group.
        """
        if self._running:
            return
        local_ip = _find_g1_local_ip()
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            sock.bind(("", self._port))
            mreq = struct.pack(
                "4s4s",
                socket.inet_aton(self._group),
                socket.inet_aton(local_ip),
            )
            sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
            sock.settimeout(self._RECV_TIMEOUT_SEC)
        except Exception:
            # Do not leak the descriptor when bind/join fails.
            sock.close()
            raise
        self._sock = sock
        self._running = True
        thread = threading.Thread(target=self._recv_loop, daemon=True)
        thread.start()
        self._thread = thread
        log.info("BuiltinMic joined %s:%d on %s", self._group, self._port, local_ip)

    def _recv_loop(self) -> None:
        """Receive datagrams into the buffer until `_running` is cleared."""
        # Keep a local reference so the loop never dereferences a socket
        # attribute that `stop()` has already reset to None.
        sock = self._sock
        if sock is None:
            return
        while self._running:
            try:
                data, _ = sock.recvfrom(4096)
            except socket.timeout:
                continue
            except Exception:
                # Best-effort: back off briefly on a receive error and retry
                # for as long as the mic is meant to be running.
                if self._running:
                    time.sleep(0.01)
                continue
            with self._lock:
                self._buf.extend(data)
                overflow = len(self._buf) - self._buf_max
                if overflow > 0:
                    # Drop the oldest bytes to keep the buffer bounded.
                    del self._buf[:overflow]

    def _take(self, count: int) -> bytes:
        """Remove and return the first `count` buffered bytes (lock must be held)."""
        chunk = bytes(self._buf[:count])
        del self._buf[:count]
        return chunk

    def read_chunk(self, num_bytes: int) -> bytes:
        """Return `num_bytes` bytes of buffered PCM, zero-padded if needed.

        Polls the buffer for up to `_MIC_READ_TIMEOUT` seconds. If a full
        chunk never becomes available, whatever is buffered (at most
        `num_bytes`) is returned, padded with zero bytes to `num_bytes`.
        """
        # Monotonic clock: the deadline must not move with wall-clock changes.
        deadline = time.monotonic() + _MIC_READ_TIMEOUT
        while time.monotonic() < deadline:
            with self._lock:
                if len(self._buf) >= num_bytes:
                    return self._take(num_bytes)
            time.sleep(0.003)
        with self._lock:
            # Cap at num_bytes: data may have arrived during the last sleep,
            # and the caller must never receive more than it asked for.
            partial = self._take(min(len(self._buf), num_bytes))
        return partial + b"\x00" * (num_bytes - len(partial))

    def flush(self) -> None:
        """Discard all buffered audio."""
        with self._lock:
            self._buf.clear()

    def stop(self) -> None:
        """Stop the receive thread and close the socket. Safe to call twice."""
        self._running = False
        thread, self._thread = self._thread, None
        # Let the receive thread leave recvfrom() before the descriptor is
        # closed; it wakes at the latest when the socket timeout expires.
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout=self._RECV_TIMEOUT_SEC + 0.5)
        sock, self._sock = self._sock, None
        if sock is not None:
            try:
                sock.close()
            except Exception:
                # Best-effort shutdown: a failing close must not abort stop().
                pass
class BuiltinSpeaker(Speaker):
    """G1 robot's built-in speaker via AudioClient.PlayStream (16 kHz mono).

    Typical sequence: `begin_stream()`, any number of `send_chunk()` calls,
    then `wait_finish()`. `stop()` interrupts playback at any time.
    """

    # Sample rate (Hz) every chunk is resampled to before being sent.
    HARDWARE_RATE = 16_000

    def __init__(self, audio_client: Any, app_name: Optional[str] = None):
        """Wrap `audio_client` and read playback tuning from the speaker config.

        Args:
            audio_client: object providing `SetVolume`, `PlayStream` and
                `_Call`, as used by the methods below.
            app_name: name passed to the audio API; defaults to the
                `app_name` config value, or "marcus".
        """
        self._ac = audio_client
        try:
            self._ac.SetVolume(100)
        except Exception:
            # Best-effort: playback still works at whatever volume is set.
            log.warning("BuiltinSpeaker.SetVolume failed", exc_info=True)
        self._app_name = app_name or _SP_CFG.get("app_name", "marcus")
        self._begin_pause = float(_SP_CFG.get("begin_stream_pause_sec", 0.15))
        self._finish_margin = float(_SP_CFG.get("wait_finish_margin_sec", 0.3))
        self._stop_flag = threading.Event()
        self._stream_id = None  # type: Optional[str]
        self._total_sent = 0.0  # seconds of audio sent in the current stream
        self._play_start = 0.0  # time.monotonic() value at begin_stream()

    def _stop_play_api(self) -> None:
        """Ask the robot to stop playback for this app; failures are logged."""
        try:
            from unitree_sdk2py.g1.audio.g1_audio_api import (
                ROBOT_API_ID_AUDIO_STOP_PLAY,
            )
            self._ac._Call(
                ROBOT_API_ID_AUDIO_STOP_PLAY,
                json.dumps({"app_name": self._app_name}),
            )
        except Exception:
            log.warning("BuiltinSpeaker AUDIO_STOP_PLAY failed", exc_info=True)

    def begin_stream(self) -> None:
        """Clear the interrupt flag, stop current playback and open a new stream."""
        self._stop_flag.clear()
        self._stop_play_api()
        time.sleep(self._begin_pause)
        # Wall-clock milliseconds are fine here: this is only an identifier.
        self._stream_id = "s_{}".format(int(time.time() * 1000))
        self._total_sent = 0.0
        # Monotonic clock so elapsed-time maths is immune to clock adjustments.
        self._play_start = time.monotonic()

    def send_chunk(self, pcm: PCMLike, source_rate: int) -> None:
        """Resample `pcm` to `HARDWARE_RATE` and send it to the current stream.

        The chunk is ignored when playback has been interrupted or when it
        holds fewer than 10 samples.
        """
        if self._stop_flag.is_set():
            return
        arr = _as_int16_array(pcm)
        if arr.size < 10:
            return
        hw = _resample_int16(arr, int(source_rate), self.HARDWARE_RATE)
        self._ac.PlayStream(self._app_name, self._stream_id, hw.tobytes())
        self._total_sent += len(hw) / float(self.HARDWARE_RATE)

    def wait_finish(self) -> None:
        """Wait out the unplayed audio plus a margin, then stop playback.

        The wait ends early as soon as `stop()` sets the interrupt flag.
        """
        elapsed = time.monotonic() - self._play_start
        remaining = self._total_sent - elapsed + self._finish_margin
        if remaining > 0:
            # Event.wait returns immediately once stop() sets the flag.
            self._stop_flag.wait(remaining)
        self._stop_play_api()

    def stop(self) -> None:
        """Interrupt playback: set the interrupt flag and stop the robot audio."""
        self._stop_flag.set()
        self._stop_play_api()

    @property
    def interrupted(self) -> bool:
        """True once `stop()` was called, until the next `begin_stream()`."""
        return self._stop_flag.is_set()

    @property
    def total_sent_sec(self) -> float:
        """Seconds of audio sent since the last `begin_stream()`."""
        return self._total_sent
# ─── AudioIO factory ──────────────────────────────────────
# Every accepted profile spelling maps to its canonical profile id.
_PROFILE_ALIASES = dict.fromkeys(("builtin", "g1", "g1_builtin"), "builtin")
# Canonical profile ids, listed in the "unknown audio profile" error message.
SUPPORTED_PROFILES = ("builtin",)
@dataclass
class AudioIO:
    """A microphone/speaker pair built for one audio profile."""

    mic: Mic
    speaker: Speaker
    profile_id: str = "builtin"

    def start(self) -> None:
        """Start the microphone."""
        self.mic.start()

    def stop(self) -> None:
        """Stop the speaker, then the mic; a failure in one is logged and
        does not prevent the other from being stopped."""
        shutdown_order = (
            (self.speaker, "AudioIO speaker.stop failed"),
            (self.mic, "AudioIO mic.stop failed"),
        )
        for device, failure_msg in shutdown_order:
            try:
                device.stop()
            except Exception:
                log.warning(failure_msg, exc_info=True)

    @classmethod
    def from_profile(
        cls,
        profile_id: str,
        *,
        audio_client: Optional[Any] = None,
    ) -> "AudioIO":
        """Build an AudioIO for the requested profile.

        `profile_id` is matched case-insensitively, ignoring surrounding
        whitespace, against the known profile aliases.

        `audio_client` is the initialised `unitree_sdk2py` `AudioClient` and
        is required for the `builtin` profile (the G1 on-board speaker).

        Raises:
            ValueError: if the profile is unknown, or if `audio_client` is
                missing for the `builtin` profile.
        """
        lookup_key = (profile_id or "").strip().lower()
        canonical = _PROFILE_ALIASES.get(lookup_key)
        if canonical is None:
            supported = ", ".join(SUPPORTED_PROFILES)
            raise ValueError(
                "unknown audio profile {!r}; supported: {}".format(
                    profile_id, supported,
                )
            )
        # Guards against an alias that resolves to a profile with no builder.
        if canonical != "builtin":
            raise AssertionError(
                "unhandled resolved profile: {!r}".format(canonical)
            )
        if audio_client is None:
            raise ValueError(
                "profile 'builtin' requires audio_client (G1 AudioClient)"
            )
        return cls(
            mic=BuiltinMic(),
            speaker=BuiltinSpeaker(audio_client),
            profile_id=canonical,
        )