581 lines
19 KiB
Python
581 lines
19 KiB
Python
"""Hardware-agnostic audio I/O for Sanad voice pipelines.
|
|
|
|
Provides a uniform Mic / Speaker interface so the model layer (Gemini
|
|
today, or any future alternative) doesn't need to know which physical
|
|
audio path is active. Pick a pairing via `AudioIO.from_profile()`:
|
|
|
|
builtin → G1 UDP multicast mic + AudioClient.PlayStream
|
|
anker → Anker PowerConf USB mic + speaker (PyAudio)
|
|
hollyland_builtin → Hollyland wireless mic + G1 built-in speaker
|
|
|
|
Mics deliver int16 mono PCM at 16 kHz.
|
|
Speakers accept int16 mono PCM plus a `source_rate` and resample
|
|
internally if the hardware runs at a different rate.
|
|
|
|
Usage:
|
|
|
|
audio = AudioIO.from_profile("builtin", audio_client=ac)
|
|
audio.start()
|
|
try:
|
|
chunk = audio.mic.read_chunk(1024) # mic
|
|
audio.speaker.begin_stream() # speaker
|
|
audio.speaker.send_chunk(pcm_24k, 24000)
|
|
audio.speaker.wait_finish()
|
|
finally:
|
|
audio.stop()
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import socket
|
|
import struct
|
|
import subprocess
|
|
import threading
|
|
import time
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, Optional, Union
|
|
|
|
import numpy as np
|
|
|
|
try:
|
|
import pyaudio
|
|
_HAS_PYAUDIO = True
|
|
except ImportError:
|
|
pyaudio = None
|
|
_HAS_PYAUDIO = False
|
|
|
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
|
from Project.Sanad.core.logger import get_logger
|
|
|
|
# Module-level logger used by every class in this file.
log = get_logger("audio_io")

# Configuration sections looked up once at import time.
_MIC_CFG = _cfg_section("voice", "mic_udp")
_SP_CFG = _cfg_section("voice", "speaker")

# Sample rate (Hz) advertised by every Mic in this module via `sample_rate`.
TARGET_MIC_RATE = 16_000

# Multicast endpoint defaults for BuiltinMic; each falls back to the literal
# shown here when the config section has no such key.
_MCAST_GRP = _MIC_CFG.get("group", "239.168.123.161")
_MCAST_PORT = _MIC_CFG.get("port", 5555)
# Upper bound on buffered mic bytes; the receive loops drop the oldest bytes
# once the buffer grows past it.
_MIC_BUF_MAX = _MIC_CFG.get("buffer_max_bytes", 64_000)
# How long (seconds) `read_chunk` polls for a full chunk before zero-padding.
_MIC_READ_TIMEOUT = _MIC_CFG.get("read_timeout_sec", 0.04)

# Anything `Speaker.send_chunk` accepts as PCM input.
PCMLike = Union[bytes, bytearray, memoryview, np.ndarray]
|
|
|
|
|
|
def _find_g1_local_ip() -> str:
    """Return this host's IPv4 address on the G1 internal 192.168.123.x network.

    Runs ``ip -4 -o addr`` and scans its whitespace-separated output for the
    first token that begins with ``192.168.123.``; anything from a ``/``
    onwards (the prefix length) is stripped.

    Raises:
        RuntimeError: if no token with that prefix appears in the output.
    """
    listing = subprocess.run(
        ["ip", "-4", "-o", "addr"], capture_output=True, text=True,
    ).stdout
    # Lazily walk rows, then tokens, in the same order as a nested loop.
    candidates = (
        token
        for row in listing.splitlines()
        for token in row.split()
        if token.startswith("192.168.123.")
    )
    first_match = next(candidates, None)
    if first_match is None:
        raise RuntimeError("no 192.168.123.x interface found")
    return first_match.split("/")[0]
|
|
|
|
|
|
def _resample_int16(pcm: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
|
|
if src_rate == dst_rate or pcm.size == 0:
|
|
return pcm.astype(np.int16, copy=False)
|
|
target_len = max(1, int(len(pcm) * dst_rate / src_rate))
|
|
return np.interp(
|
|
np.linspace(0, len(pcm), target_len, endpoint=False),
|
|
np.arange(len(pcm)),
|
|
pcm.astype(np.float64),
|
|
).astype(np.int16)
|
|
|
|
|
|
def _as_int16_array(pcm: PCMLike) -> np.ndarray:
|
|
if isinstance(pcm, np.ndarray):
|
|
return pcm.astype(np.int16, copy=False)
|
|
return np.frombuffer(bytes(pcm), dtype=np.int16)
|
|
|
|
|
|
# ─── Protocols ────────────────────────────────────────────
|
|
|
|
class Mic(ABC):
    """Abstract microphone: a startable source of raw PCM bytes."""

    # Sample rate (Hz) of the bytes returned by `read_chunk`.
    sample_rate: int = TARGET_MIC_RATE

    @abstractmethod
    def start(self) -> None:
        """Acquire the device and begin capturing."""

    @abstractmethod
    def read_chunk(self, num_bytes: int) -> bytes:
        """Return captured audio for a request of `num_bytes` bytes."""

    @abstractmethod
    def flush(self) -> None:
        """Discard any audio captured but not yet read."""

    @abstractmethod
    def stop(self) -> None:
        """Stop capturing and release the device."""
|
|
|
|
|
|
class Speaker(ABC):
    """Abstract speaker: a sink for streamed PCM chunks."""

    @abstractmethod
    def begin_stream(self) -> None:
        """Prepare for a new playback stream."""

    @abstractmethod
    def send_chunk(self, pcm: PCMLike, source_rate: int) -> None:
        """Queue PCM for playback. `source_rate` is the sample rate of `pcm`."""

    @abstractmethod
    def wait_finish(self) -> None:
        """Finish the current playback stream."""

    @abstractmethod
    def stop(self) -> None:
        """Stop playback."""

    @property
    @abstractmethod
    def interrupted(self) -> bool:
        """Whether playback has been interrupted."""

    @property
    def total_sent_sec(self) -> float:
        """Seconds of audio sent so far; this base implementation reports 0.0."""
        return 0.0
|
|
|
|
|
|
# ─── G1 built-in (UDP mic + AudioClient speaker) ──────────
|
|
|
|
class BuiltinMic(Mic):
    """G1 robot's on-board mic published over UDP multicast.

    `start()` joins the multicast group on the host's 192.168.123.x
    interface and launches a daemon thread that appends every received
    datagram to a bounded byte buffer. `read_chunk()` drains that buffer.
    """

    sample_rate = TARGET_MIC_RATE

    def __init__(self, group: str = _MCAST_GRP, port: int = _MCAST_PORT,
                 buf_max: int = _MIC_BUF_MAX):
        """Store the multicast endpoint; no socket is opened until `start()`.

        Args:
            group: multicast group address to join.
            port: UDP port to bind.
            buf_max: maximum number of bytes kept in the receive buffer;
                the oldest bytes are discarded beyond this.
        """
        self._group = group
        self._port = port
        self._buf_max = buf_max
        self._sock: Optional[socket.socket] = None
        self._buf = bytearray()
        self._lock = threading.Lock()
        self._running = False
        self._thread: Optional[threading.Thread] = None

    def start(self) -> None:
        """Join the multicast group and start the receiver thread.

        Calling `start()` while already running is a logged no-op, so a
        second call cannot leak the first socket and thread.

        Raises:
            RuntimeError: if no 192.168.123.x interface is found.
            OSError: if the socket cannot be bound or joined to the group.
        """
        if self._running:
            log.warning("BuiltinMic.start called while already running; ignored")
            return
        local_ip = _find_g1_local_ip()
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            sock.bind(("", self._port))
            mreq = struct.pack(
                "4s4s",
                socket.inet_aton(self._group),
                socket.inet_aton(local_ip),
            )
            sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
            # Bounded recv so the thread notices a stop request.
            sock.settimeout(1.0)
        except Exception:
            # Do not leak the descriptor when setup fails half-way.
            sock.close()
            raise
        self._sock = sock
        self._running = True
        self._thread = threading.Thread(
            target=self._recv_loop, args=(sock,), daemon=True,
        )
        self._thread.start()
        log.info("BuiltinMic joined %s:%d on %s", self._group, self._port, local_ip)

    def _recv_loop(self, sock: Optional[socket.socket] = None) -> None:
        """Receive datagrams into the buffer until this socket is retired.

        The loop is tied to the socket it was started with: once `stop()`
        replaces `self._sock`, it exits even if the mic is restarted.
        """
        if sock is None:
            sock = self._sock
        while self._running and sock is not None and self._sock is sock:
            try:
                data, _ = sock.recvfrom(4096)
            except socket.timeout:
                continue
            except Exception:
                # Typically the socket being closed by stop(); back off
                # briefly only if we are still meant to be receiving.
                if self._running and self._sock is sock:
                    time.sleep(0.01)
                continue
            with self._lock:
                self._buf.extend(data)
                overflow = len(self._buf) - self._buf_max
                if overflow > 0:
                    # Keep only the newest `buf_max` bytes.
                    del self._buf[:overflow]

    def read_chunk(self, num_bytes: int) -> bytes:
        """Return exactly `num_bytes` bytes of captured audio.

        Polls the buffer for up to `_MIC_READ_TIMEOUT` seconds. If a full
        chunk does not arrive in time, whatever is buffered is returned,
        padded with zero bytes up to `num_bytes`.
        """
        if num_bytes <= 0:
            return b""
        # Monotonic clock: the deadline must not move with wall-clock jumps.
        deadline = time.monotonic() + _MIC_READ_TIMEOUT
        while time.monotonic() < deadline:
            with self._lock:
                if len(self._buf) >= num_bytes:
                    chunk = bytes(self._buf[:num_bytes])
                    del self._buf[:num_bytes]
                    return chunk
            time.sleep(0.003)
        with self._lock:
            # Data may have arrived during the last sleep, so cap the slice
            # at `num_bytes`; never hand back more than was asked for.
            chunk = bytes(self._buf[:num_bytes])
            del self._buf[:num_bytes]
        return chunk + b"\x00" * (num_bytes - len(chunk))

    def flush(self) -> None:
        """Discard all buffered audio."""
        with self._lock:
            self._buf.clear()

    def stop(self) -> None:
        """Stop receiving and close the socket. Safe to call repeatedly."""
        self._running = False
        sock, self._sock = self._sock, None
        if sock is not None:
            try:
                sock.close()
            except Exception:
                log.debug("BuiltinMic socket close failed", exc_info=True)
|
|
|
|
|
|
class BuiltinSpeaker(Speaker):
    """G1 robot's built-in speaker via AudioClient.PlayStream (16 kHz mono).

    Chunks are resampled to `HARDWARE_RATE` and pushed with `PlayStream`.
    The amount of audio sent is tracked so that `wait_finish()` can wait
    for the estimated remaining playback time.
    """

    HARDWARE_RATE = 16_000

    def __init__(self, audio_client: Any, app_name: Optional[str] = None):
        """Wrap `audio_client` and try to set its volume to 100.

        Args:
            audio_client: object exposing `SetVolume`, `PlayStream` and
                `_Call`.
            app_name: application name passed to the audio API; defaults
                to the `app_name` speaker config value, else "sanad".
        """
        self._ac = audio_client
        try:
            self._ac.SetVolume(100)
        except Exception:
            # Best effort: a volume failure must not prevent playback.
            log.warning("BuiltinSpeaker.SetVolume failed", exc_info=True)
        self._app_name = app_name or _SP_CFG.get("app_name", "sanad")
        self._begin_pause = _SP_CFG.get("begin_stream_pause_sec", 0.15)
        self._finish_margin = _SP_CFG.get("wait_finish_margin_sec", 0.3)
        self._stop_flag = threading.Event()
        self._stream_id: Optional[str] = None
        self._total_sent = 0.0
        # Monotonic timestamp taken in begin_stream().
        self._play_start = 0.0

    def _stop_play_api(self) -> None:
        """Ask the robot to stop playback for this app; failures are logged."""
        try:
            from unitree_sdk2py.g1.audio.g1_audio_api import (
                ROBOT_API_ID_AUDIO_STOP_PLAY,
            )
            self._ac._Call(
                ROBOT_API_ID_AUDIO_STOP_PLAY,
                json.dumps({"app_name": self._app_name}),
            )
        except Exception:
            log.warning("BuiltinSpeaker AUDIO_STOP_PLAY failed", exc_info=True)

    def begin_stream(self) -> None:
        """Stop any current playback and start a fresh stream id.

        Blocks for the configured `begin_stream_pause_sec` after the stop
        request before the new stream begins.
        """
        self._stop_flag.clear()
        self._stop_play_api()
        time.sleep(self._begin_pause)
        # Wall-clock milliseconds are used only as an identifier.
        self._stream_id = f"s_{int(time.time() * 1000)}"
        self._total_sent = 0.0
        self._play_start = time.monotonic()

    def send_chunk(self, pcm: PCMLike, source_rate: int) -> None:
        """Resample `pcm` to the hardware rate and push it to the robot.

        Does nothing after `stop()` (until the next `begin_stream()`), and
        drops chunks of fewer than 10 samples.
        """
        if self._stop_flag.is_set():
            return
        arr = _as_int16_array(pcm)
        if arr.size < 10:
            return
        hw = _resample_int16(arr, source_rate, self.HARDWARE_RATE)
        self._ac.PlayStream(self._app_name, self._stream_id, hw.tobytes())
        self._total_sent += len(hw) / self.HARDWARE_RATE

    def wait_finish(self) -> None:
        """Wait for the estimated end of playback, then stop the stream.

        The wait is the audio sent so far, minus the time elapsed since
        `begin_stream()`, plus the configured margin. It ends early as
        soon as `stop()` is called.
        """
        elapsed = time.monotonic() - self._play_start
        remaining = self._total_sent - elapsed + self._finish_margin
        if remaining > 0:
            # Event.wait returns immediately when stop() sets the flag,
            # and never sleeps past `remaining`.
            self._stop_flag.wait(remaining)
        self._stop_play_api()

    def stop(self) -> None:
        """Mark the stream interrupted and ask the robot to stop playback."""
        self._stop_flag.set()
        self._stop_play_api()

    @property
    def interrupted(self) -> bool:
        """True once `stop()` has been called since the last `begin_stream()`."""
        return self._stop_flag.is_set()

    @property
    def total_sent_sec(self) -> float:
        """Seconds of audio (at the hardware rate) sent in the current stream."""
        return self._total_sent
|
|
|
|
|
|
# ─── PyAudio-backed mic/speaker ───────────────────────────
|
|
|
|
class _PyAudioMic(Mic):
    """Shared base for PulseAudio/ALSA input — matches device by name pattern.

    `start()` opens a mono int16 input stream and launches a daemon thread
    that appends captured frames to a bounded byte buffer, which
    `read_chunk()` drains.
    """

    sample_rate = TARGET_MIC_RATE

    def __init__(self, device_pattern: str, label: str,
                 frames_per_buffer: int = 512):
        """Record the device selection; nothing is opened until `start()`.

        Args:
            device_pattern: comma-separated substrings; the first input
                device whose name contains any of them (case-insensitive)
                is used.
            label: short name used in log and error messages.
            frames_per_buffer: frames requested per stream read.

        Raises:
            RuntimeError: if pyaudio is not installed.
        """
        if not _HAS_PYAUDIO:
            raise RuntimeError(f"{label}Mic requires pyaudio")
        self._device_pattern = device_pattern
        self._label = label
        self._frames_per_buffer = frames_per_buffer
        self._pa: Optional["pyaudio.PyAudio"] = None
        self._stream = None
        self._running = False
        self._buf = bytearray()
        self._lock = threading.Lock()
        self._thread: Optional[threading.Thread] = None

    def _resolve_device_index(self) -> Optional[int]:
        """Return the index of the first matching input device, else None."""
        if self._pa is None:
            return None
        patterns = [p.strip().lower()
                    for p in self._device_pattern.split(",") if p.strip()]
        for i in range(self._pa.get_device_count()):
            info = self._pa.get_device_info_by_index(i)
            if info.get("maxInputChannels", 0) <= 0:
                continue
            name_lower = str(info.get("name", "")).lower()
            if any(n in name_lower for n in patterns):
                return i
        return None

    def start(self) -> None:
        """Open the input stream and start the reader thread.

        When no device matches the pattern, `input_device_index=None` is
        passed to PyAudio. Calling `start()` while already running is a
        logged no-op. If opening fails, the PyAudio instance is terminated
        before the error propagates.
        """
        if self._running:
            log.warning("%sMic.start called while already running; ignored",
                        self._label)
            return
        pa = pyaudio.PyAudio()
        self._pa = pa
        try:
            idx = self._resolve_device_index()
            stream = pa.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                input=True,
                input_device_index=idx,
                frames_per_buffer=self._frames_per_buffer,
            )
        except Exception:
            # Do not leak the PortAudio session when the device can't open.
            self._pa = None
            pa.terminate()
            raise
        self._stream = stream
        self._running = True
        worker = threading.Thread(
            target=self._recv_loop, args=(stream,), daemon=True,
        )
        worker.start()
        self._thread = worker
        log.info("%sMic started (device_index=%s)", self._label, idx)

    def _recv_loop(self, stream: Any = None) -> None:
        """Read frames into the buffer until this stream is retired.

        The loop is tied to the stream it was started with, so it exits
        once `stop()` clears `self._stream`, even after a restart.
        """
        if stream is None:
            stream = self._stream
        while self._running and stream is not None and self._stream is stream:
            try:
                data = stream.read(
                    self._frames_per_buffer, exception_on_overflow=False,
                )
            except Exception:
                if self._running and self._stream is stream:
                    time.sleep(0.01)
                continue
            with self._lock:
                self._buf.extend(data)
                overflow = len(self._buf) - _MIC_BUF_MAX
                if overflow > 0:
                    # Keep only the newest `_MIC_BUF_MAX` bytes.
                    del self._buf[:overflow]

    def read_chunk(self, num_bytes: int) -> bytes:
        """Return exactly `num_bytes` bytes of captured audio.

        Polls the buffer for up to `_MIC_READ_TIMEOUT` seconds. If a full
        chunk does not arrive in time, whatever is buffered is returned,
        padded with zero bytes up to `num_bytes`.
        """
        if num_bytes <= 0:
            return b""
        # Monotonic clock: the deadline must not move with wall-clock jumps.
        deadline = time.monotonic() + _MIC_READ_TIMEOUT
        while time.monotonic() < deadline:
            with self._lock:
                if len(self._buf) >= num_bytes:
                    chunk = bytes(self._buf[:num_bytes])
                    del self._buf[:num_bytes]
                    return chunk
            time.sleep(0.003)
        with self._lock:
            # Data may have arrived during the last sleep, so cap the slice
            # at `num_bytes`; never hand back more than was asked for.
            chunk = bytes(self._buf[:num_bytes])
            del self._buf[:num_bytes]
        return chunk + b"\x00" * (num_bytes - len(chunk))

    def flush(self) -> None:
        """Discard all buffered audio."""
        with self._lock:
            self._buf.clear()

    def stop(self) -> None:
        """Stop the reader thread, then close the stream and PyAudio.

        The thread is joined (bounded wait) before the stream is closed so
        the stream is not torn down underneath a blocking `read()`.
        Safe to call repeatedly.
        """
        self._running = False
        worker, self._thread = self._thread, None
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=1.0)
        stream, self._stream = self._stream, None
        if stream is not None:
            try:
                stream.stop_stream()
                stream.close()
            except Exception:
                log.debug("%sMic stream close failed", self._label,
                          exc_info=True)
        pa, self._pa = self._pa, None
        if pa is not None:
            try:
                pa.terminate()
            except Exception:
                log.debug("%sMic PyAudio terminate failed", self._label,
                          exc_info=True)
|
|
|
|
|
|
class AnkerMic(_PyAudioMic):
    """PyAudio mic selected by the name patterns "powerconf" / "anker"."""

    def __init__(self):
        name_patterns = "powerconf,anker"
        super().__init__(device_pattern=name_patterns, label="Anker")
|
|
|
|
|
|
class HollylandMic(_PyAudioMic):
    """PyAudio mic selected by the name patterns "hollyland" / "wireless_microphone"."""

    def __init__(self):
        name_patterns = "hollyland,wireless_microphone"
        super().__init__(device_pattern=name_patterns, label="Hollyland")
|
|
|
|
|
|
class _PyAudioSpeaker(Speaker):
    """PulseAudio/ALSA output — matches the device by name pattern.

    The output stream is opened lazily by `send_chunk()` at the rate of the
    chunk being sent, reopened when a later chunk uses a different rate,
    and closed by `wait_finish()` / `stop()`.
    """

    # NOTE(review): the PyAudio instance created here is never terminated by
    # this class — confirm whether teardown should release it.

    def __init__(self, device_pattern: str, label: str):
        """Record the device selection; nothing is opened yet.

        Raises:
            RuntimeError: if pyaudio is not installed.
        """
        if not _HAS_PYAUDIO:
            raise RuntimeError(f"{label}Speaker requires pyaudio")
        self._device_pattern = device_pattern
        self._label = label
        self._pa: Optional["pyaudio.PyAudio"] = None
        self._stream = None
        self._stream_rate: Optional[int] = None
        self._stop_flag = threading.Event()
        self._total_sent = 0.0

    def _resolve_device_index(self) -> Optional[int]:
        """Return the index of the first matching output device, else None."""
        pa = self._pa
        if pa is None:
            return None
        needles = [
            part.strip().lower()
            for part in self._device_pattern.split(",")
            if part.strip()
        ]
        for index in range(pa.get_device_count()):
            info = pa.get_device_info_by_index(index)
            if info.get("maxOutputChannels", 0) <= 0:
                continue
            device_name = str(info.get("name", "")).lower()
            for needle in needles:
                if needle in device_name:
                    return index
        return None

    def _open_stream(self, rate: int) -> None:
        """Open a mono int16 output stream at `rate` on the matched device."""
        device_index = self._resolve_device_index()
        self._stream = self._pa.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=rate,
            output=True,
            output_device_index=device_index,
        )
        self._stream_rate = rate
        log.info("%sSpeaker output opened (device_index=%s, rate=%d)",
                 self._label, device_index, rate)

    def _close_stream(self) -> bool:
        """Stop and close the current stream, ignoring close errors.

        Returns True when there was a stream to close.
        """
        stream = self._stream
        if stream is None:
            return False
        try:
            stream.stop_stream()
            stream.close()
        except Exception:
            # Best effort: a failed close must not block playback control.
            pass
        self._stream = None
        return True

    def begin_stream(self) -> None:
        """Clear the interrupted flag and sent-audio counter for a new stream."""
        self._stop_flag.clear()
        self._total_sent = 0.0
        if self._pa is None:
            self._pa = pyaudio.PyAudio()

    def send_chunk(self, pcm: PCMLike, source_rate: int) -> None:
        """Write `pcm` to the output stream at `source_rate`.

        Does nothing after `stop()` (until the next `begin_stream()`), and
        drops chunks of fewer than 10 samples. Write errors are logged,
        not raised.
        """
        if self._stop_flag.is_set():
            return
        samples = _as_int16_array(pcm)
        if samples.size < 10:
            return
        if self._pa is None:
            self._pa = pyaudio.PyAudio()
        needs_open = self._stream is None or self._stream_rate != source_rate
        if needs_open:
            self._close_stream()
            self._open_stream(source_rate)
        try:
            self._stream.write(samples.tobytes())
            self._total_sent += len(samples) / source_rate
        except Exception as exc:
            log.warning("%sSpeaker write failed: %s", self._label, exc)

    def wait_finish(self) -> None:
        """Stop and close the output stream, if one is open."""
        if self._close_stream():
            self._stream_rate = None

    def stop(self) -> None:
        """Mark the stream interrupted and close the output stream."""
        self._stop_flag.set()
        self.wait_finish()

    @property
    def interrupted(self) -> bool:
        """True once `stop()` has been called since the last `begin_stream()`."""
        return self._stop_flag.is_set()

    @property
    def total_sent_sec(self) -> float:
        """Seconds of audio successfully written in the current stream."""
        return self._total_sent
|
|
|
|
|
|
class AnkerSpeaker(_PyAudioSpeaker):
    """PyAudio speaker selected by the name patterns "powerconf" / "anker"."""

    def __init__(self):
        name_patterns = "powerconf,anker"
        super().__init__(device_pattern=name_patterns, label="Anker")
|
|
|
|
|
|
# ─── Factory ──────────────────────────────────────────────
|
|
|
|
_PROFILE_ALIASES = {
|
|
"builtin": "builtin",
|
|
"g1_builtin": "builtin",
|
|
"g1": "builtin",
|
|
"anker": "anker",
|
|
"anker_powerconf": "anker",
|
|
"hollyland": "hollyland_builtin",
|
|
"hollyland_builtin": "hollyland_builtin",
|
|
}
|
|
|
|
SUPPORTED_PROFILES = ("builtin", "anker", "hollyland_builtin")
|
|
|
|
|
|
@dataclass
class AudioIO:
    """A mic/speaker pairing selected by profile."""

    mic: Mic
    speaker: Speaker
    profile_id: str = "builtin"

    def start(self) -> None:
        """Start the mic."""
        self.mic.start()

    def stop(self) -> None:
        """Stop the speaker, then the mic; a failure in either is only logged."""
        steps = (
            (lambda: self.speaker.stop(), "AudioIO speaker.stop failed"),
            (lambda: self.mic.stop(), "AudioIO mic.stop failed"),
        )
        for shut_down, failure_message in steps:
            try:
                shut_down()
            except Exception:
                log.warning(failure_message, exc_info=True)

    @classmethod
    def from_profile(
        cls,
        profile_id: str,
        *,
        audio_client: Optional[Any] = None,
    ) -> "AudioIO":
        """Build an AudioIO for the requested profile.

        `profile_id` is matched case-insensitively, ignoring surrounding
        whitespace, against the known aliases.

        `audio_client` is the initialised `unitree_sdk2py` `AudioClient` and
        is required for any profile that speaks through the G1's on-board
        speaker (`builtin`, `hollyland_builtin`).

        Raises:
            ValueError: for an unknown profile, or when a profile that
                needs `audio_client` is requested without one.
        """
        lookup_key = (profile_id or "").strip().lower()
        canonical = _PROFILE_ALIASES.get(lookup_key)
        if canonical is None:
            supported = ", ".join(SUPPORTED_PROFILES)
            raise ValueError(
                f"unknown audio profile {profile_id!r}; "
                f"supported: {supported}"
            )

        if canonical == "anker":
            return cls(
                mic=AnkerMic(),
                speaker=AnkerSpeaker(),
                profile_id=canonical,
            )

        # The remaining profiles both play through the G1 speaker.
        if canonical == "builtin":
            if audio_client is None:
                raise ValueError(
                    "profile 'builtin' requires audio_client (G1 AudioClient)"
                )
            mic = BuiltinMic()
        elif canonical == "hollyland_builtin":
            if audio_client is None:
                raise ValueError(
                    "profile 'hollyland_builtin' uses the G1 speaker — "
                    "requires audio_client"
                )
            mic = HollylandMic()
        else:
            raise AssertionError(f"unhandled resolved profile: {canonical!r}")

        return cls(
            mic=mic,
            speaker=BuiltinSpeaker(audio_client),
            profile_id=canonical,
        )
|