409 lines
15 KiB
Python
Executable File
409 lines
15 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""
|
||
record_phrases.py — Build Saqr's audio library for AudioClient.PlayStream.
|
||
|
||
Two modes:
|
||
|
||
DEFAULT (TTS capture) — the script calls AudioClient.TtsMaker for each
|
||
phrase and simultaneously records the G1 speaker
|
||
output via the mic. Zero speaking required. Voice
|
||
is the G1's own TTS voice, but at runtime
|
||
PlayStream skips the firmware synthesis buffer so
|
||
it plays back ~200–700 ms faster.
|
||
|
||
--mic (your voice) — the script shows each phrase, counts down, and
|
||
records whatever the mic hears. You speak each
|
||
line yourself.
|
||
|
||
Both modes save 16 kHz mono int16 WAVs under
|
||
``assets/audio/<category>/<key>.wav`` — exactly what ``robot/audio_player``
|
||
expects.
|
||
|
||
Usage (run on the robot or any machine on the G1 subnet):
|
||
|
||
python3 scripts/record_phrases.py # TTS capture, all 8
|
||
python3 scripts/record_phrases.py --only safe,helmet # just those two
|
||
python3 scripts/record_phrases.py --iface enp3s0 # custom DDS iface
|
||
python3 scripts/record_phrases.py --mic # your voice instead
|
||
python3 scripts/record_phrases.py --play # verify each clip by replaying
|
||
|
||
Requires ``unitree_sdk2py`` in the active conda env (TTS-capture and --play).
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import socket
|
||
import struct
|
||
import subprocess
|
||
import sys
|
||
import threading
|
||
import time
|
||
import wave
|
||
from pathlib import Path
|
||
|
||
# Two directory levels above this file; clips are written beneath it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Root of the clip library; each clip lands at ``<category>/<key>.wav``.
ASSETS_ROOT = PROJECT_ROOT.joinpath("assets", "audio")

# UDP multicast group and port this script joins to receive the mic feed.
MCAST_GRP, MCAST_PORT = "239.168.123.161", 5555
# Sample rate (Hz) written into every WAV header and used for all timing math.
RATE = 16_000

# Shared opening sentence of every "unsafe" announcement.
_UNSAFE_PREFIX = "Please stop. Wear your proper safety equipment."

# (category, key, text).
PHRASES = [
    ("fixed", "ready", "Saqr is running. Press R2 plus X to start."),
    ("fixed", "deactivated", "Saqr deactivated."),
    ("fixed", "no_camera",
     "Camera not connected. Please plug in the camera and try again."),
    ("fixed", "safe", "Safe to enter. Have a good day."),
    ("fixed", "unsafe_generic", _UNSAFE_PREFIX),
    ("unsafe_missing", "helmet",
     _UNSAFE_PREFIX + " You are missing helmet."),
    ("unsafe_missing", "vest",
     _UNSAFE_PREFIX + " You are missing vest."),
    ("unsafe_missing", "helmet_vest",
     _UNSAFE_PREFIX + " You are missing helmet and vest."),
]
|
||
|
||
|
||
# ── mic capture ─────────────────────────────────────────────────────────────
|
||
def find_local_ip() -> str:
    """Pick the first 192.168.123.x address on this machine.

    Runs ``ip -4 -o addr`` and returns the first whitespace-separated token
    that starts with ``192.168.123.``, with any ``/prefix`` suffix removed.

    Raises:
        RuntimeError: if the ``ip`` command cannot be executed, or if no
            192.168.123.x address is found in its output.
    """
    try:
        proc = subprocess.run(
            ["ip", "-4", "-o", "addr"], capture_output=True, text=True
        )
    except OSError as e:
        # Missing/unrunnable `ip` binary: surface it as the same exception
        # type callers already handle instead of an unexpected OSError.
        raise RuntimeError(
            f"Could not run `ip -4 -o addr` to look up the local address: {e}"
        ) from e

    for line in proc.stdout.splitlines():
        for tok in line.split():
            if tok.startswith("192.168.123."):
                return tok.split("/")[0]
    raise RuntimeError(
        "No 192.168.123.x address on this machine — connect to the G1 network first."
    )
|
||
|
||
|
||
def _open_mcast_socket(local_ip: str) -> socket.socket:
    """Open a UDP socket bound to ``MCAST_PORT`` and joined to ``MCAST_GRP``.

    Args:
        local_ip: IPv4 address of the local interface on which to join the
            multicast group.

    Returns:
        The bound socket with group membership added. The caller owns it and
        must close it.

    Raises:
        OSError: if binding, address parsing, or joining the group fails. The
            socket is closed before the error propagates.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(("", MCAST_PORT))
        # ip_mreq layout: 4-byte group address followed by 4-byte interface address.
        mreq = struct.pack(
            "4s4s", socket.inet_aton(MCAST_GRP), socket.inet_aton(local_ip)
        )
        s.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
    except Exception:
        # Don't leak the descriptor when setup fails part-way through.
        s.close()
        raise
    return s
|
||
|
||
|
||
def record_multicast_fixed(seconds: float) -> bytes:
    """Blocking mic capture for exactly ``seconds`` (used by --mic mode).

    Reads datagrams from the multicast mic feed until ``seconds`` worth of
    16-bit mono audio at ``RATE`` has arrived, or until ``seconds + 5`` of
    wall time has elapsed, whichever comes first.

    Args:
        seconds: Requested clip length in seconds.

    Returns:
        At most ``seconds`` of raw PCM, always a whole number of 16-bit
        samples. May be shorter (even empty) if the feed stalls.

    Raises:
        RuntimeError: propagated from ``find_local_ip``.
        OSError: propagated from socket setup.
    """
    local_ip = find_local_ip()
    # Count in whole samples so the result is never cut mid-sample.
    target_bytes = int(RATE * seconds) * 2
    buf = bytearray()

    sock = _open_mcast_socket(local_ip)
    try:
        sock.settimeout(2.0)
        # Monotonic clock: immune to wall-clock adjustments during capture.
        deadline = time.monotonic() + seconds + 5
        while len(buf) < target_bytes and time.monotonic() < deadline:
            try:
                data, _ = sock.recvfrom(4096)
            except socket.timeout:
                continue
            buf.extend(data)
    finally:
        sock.close()

    # The last datagram usually overshoots the target; trim it, and drop a
    # dangling odd byte if the feed stopped early on an odd boundary.
    usable = min(len(buf), target_bytes)
    usable -= usable % 2
    return bytes(buf[:usable])
|
||
|
||
|
||
def record_while_tts_plays(ac, text: str, speaker_id: int,
                           capture_seconds: float) -> bytes:
    """Start mic capture, call TtsMaker, capture for ``capture_seconds``, stop.

    A background thread appends every datagram from the multicast mic feed to
    a shared buffer while this thread asks the robot to speak. If the first
    ``TtsMaker`` call returns a non-zero code it is retried once after 2 s.

    Args:
        ac: Audio client object exposing ``TtsMaker(text, speaker_id)``.
        text: Phrase to synthesise.
        speaker_id: Passed through to ``TtsMaker``.
        capture_seconds: How long to keep recording after a successful
            ``TtsMaker`` call.

    Returns:
        The captured raw bytes, or ``b""`` if ``TtsMaker`` raised or returned
        a non-zero code on both attempts.

    Raises:
        RuntimeError: propagated from ``find_local_ip``.
    """
    local_ip = find_local_ip()
    buf = bytearray()
    stop_flag = threading.Event()

    def _mic() -> None:
        sock = _open_mcast_socket(local_ip)
        sock.settimeout(0.5)  # short timeout so stop_flag is polled regularly
        try:
            while not stop_flag.is_set():
                try:
                    data, _ = sock.recvfrom(4096)
                except socket.timeout:
                    continue
                buf.extend(data)
        finally:
            sock.close()

    t = threading.Thread(target=_mic, daemon=True)
    t.start()
    try:
        time.sleep(0.3)  # let the mic socket settle before TTS fires

        try:
            code = ac.TtsMaker(text, speaker_id)
        except Exception as e:
            print(f" [FAIL] TtsMaker raised: {e}")
            return b""

        if code != 0:
            print(f" [FAIL] TtsMaker rc={code} — retry in 2s…")
            time.sleep(2.0)
            # Discard whatever was recorded around the failed attempt so it
            # does not end up at the head of the clip.
            buf.clear()
            try:
                code = ac.TtsMaker(text, speaker_id)
            except Exception as e:
                print(f" [FAIL] TtsMaker retry raised: {e}")
                return b""
            if code != 0:
                print(f" [FAIL] TtsMaker retry rc={code}")
                return b""

        time.sleep(capture_seconds)
    finally:
        # Single cleanup point: runs on success, on every failure return,
        # and on interruption (e.g. Ctrl-C during the sleeps).
        stop_flag.set()
        t.join(timeout=1.0)
    return bytes(buf)
|
||
|
||
|
||
# ── processing ──────────────────────────────────────────────────────────────
|
||
def rms_and_peak(pcm: bytes):
    """Return ``(rms, peak)`` of ``pcm`` interpreted as int16 samples.

    Args:
        pcm: Raw 16-bit PCM. A dangling odd trailing byte is ignored.

    Returns:
        Tuple ``(rms, peak)``: ``rms`` is a float, ``peak`` is the largest
        absolute sample value as an int (up to 32768). ``(0.0, 0)`` when
        there are no complete samples.
    """
    import numpy as np

    usable = len(pcm) - (len(pcm) % 2)
    if usable == 0:
        return 0.0, 0
    samples = np.frombuffer(memoryview(pcm)[:usable], dtype=np.int16)
    # Widen before abs(): in int16, abs(-32768) wraps back to -32768, which
    # would report a negative peak and hide hard clipping.
    wide = samples.astype(np.int32)
    rms = float(np.sqrt(np.mean(wide.astype(np.float64) ** 2)))
    peak = int(np.abs(wide).max())
    return rms, peak
|
||
|
||
|
||
def trim_leading_silence(pcm: bytes, threshold: int = 500,
                         head_ms: int = 80, rate: int | None = None) -> bytes:
    """Cut silence before the first sample above threshold; keep ``head_ms`` lead-in.

    Args:
        pcm: Raw 16-bit PCM. A dangling odd trailing byte is dropped.
        threshold: A sample counts as sound when its absolute value is
            strictly greater than this.
        head_ms: Milliseconds of audio to keep before the first loud sample.
        rate: Sample rate in Hz used to convert ``head_ms`` to samples.
            Defaults to the module-level ``RATE``.

    Returns:
        The trimmed PCM bytes, or the (sample-aligned) input unchanged when
        no sample exceeds ``threshold``.
    """
    import numpy as np

    if rate is None:
        rate = RATE
    if len(pcm) % 2:
        pcm = pcm[:-1]  # frombuffer rejects a partial trailing sample
    a = np.frombuffer(pcm, dtype=np.int16)
    # Widen before abs(): int16 abs(-32768) wraps to -32768 and would be
    # mistaken for silence.
    above = np.where(np.abs(a.astype(np.int32)) > threshold)[0]
    if above.size == 0:
        return pcm
    start = max(0, int(above[0]) - int(head_ms / 1000.0 * rate))
    return a[start:].tobytes()
|
||
|
||
|
||
def trim_trailing_silence(pcm: bytes, threshold: int = 500,
                          tail_ms: int = 150, rate: int | None = None) -> bytes:
    """Cut silence after the last sample above threshold; keep ``tail_ms`` of tail.

    Args:
        pcm: Raw 16-bit PCM. A dangling odd trailing byte is dropped.
        threshold: A sample counts as sound when its absolute value is
            strictly greater than this.
        tail_ms: Milliseconds of audio to keep after the last loud sample.
        rate: Sample rate in Hz used to convert ``tail_ms`` to samples.
            Defaults to the module-level ``RATE``.

    Returns:
        The trimmed PCM bytes, or the (sample-aligned) input unchanged when
        no sample exceeds ``threshold``.
    """
    import numpy as np

    if rate is None:
        rate = RATE
    if len(pcm) % 2:
        pcm = pcm[:-1]  # frombuffer rejects a partial trailing sample
    a = np.frombuffer(pcm, dtype=np.int16)
    # Widen before abs(): int16 abs(-32768) wraps to -32768 and would be
    # mistaken for silence.
    above = np.where(np.abs(a.astype(np.int32)) > threshold)[0]
    if above.size == 0:
        return pcm
    # +1 because the slice end is exclusive: the last loud sample itself must
    # be kept, and the tail is counted from the sample after it.
    end = int(above[-1]) + 1 + int(tail_ms / 1000.0 * rate)
    end = min(end, a.size)
    return a[:end].tobytes()
|
||
|
||
|
||
def save_wav(pcm: bytes, path: Path) -> None:
    """Write ``pcm`` to ``path`` as a 1-channel, 2-byte-sample WAV at ``RATE`` Hz.

    Missing parent directories are created first.
    """
    destination = str(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    writer = wave.open(destination, "wb")
    with writer:
        # Header fields: frame rate, sample width (bytes), channel count.
        writer.setframerate(RATE)
        writer.setsampwidth(2)
        writer.setnchannels(1)
        writer.writeframes(pcm)
|
||
|
||
|
||
def estimate_tts_duration(text: str) -> float:
    """Estimate speech length as 0.12 s per character, never below 2.5 s.

    The original note says this is meant to match RobotController's pacing
    estimate; that class is not visible from this file.
    """
    floor_s = 2.5
    estimate = len(text) * 0.12
    if estimate > floor_s:
        return estimate
    return floor_s
|
||
|
||
|
||
# ── optional playback verification ──────────────────────────────────────────
|
||
def play_on_g1(ac, path: Path) -> None:
    """Stream the WAV at ``path`` to the robot via ``ac.PlayStream`` and wait for it.

    The file's frames are sent in 96000-byte chunks under the app name
    ``"saqr_verify"``. After each chunk the function sleeps for half of that
    chunk's duration, then for the full clip duration plus 0.3 s, and finally
    calls ``ac.PlayStop``.

    Args:
        ac: Audio client exposing ``PlayStream(app, stream_id, data)`` and
            ``PlayStop(app)``.
        path: WAV file to play; assumed to be 16-bit mono at ``RATE`` for the
            timing math to be right.

    Raises:
        Exception: whatever ``wave.open`` or ``ac.PlayStream`` raises.
            ``PlayStop`` is still attempted in that case.
    """
    with wave.open(str(path), "rb") as wf:
        pcm = wf.readframes(wf.getnframes())

    CHUNK = 96000
    bytes_per_second = RATE * 2
    sid = f"verify_{int(time.time() * 1000)}"  # unique-ish stream id per call

    try:
        for offset in range(0, len(pcm), CHUNK):
            chunk = pcm[offset:offset + CHUNK]
            ac.PlayStream("saqr_verify", sid, chunk)
            # Feed faster than real time so the next chunk arrives early.
            time.sleep(len(chunk) / bytes_per_second / 2)
        time.sleep(len(pcm) / bytes_per_second + 0.3)
    finally:
        # Always try to end the stream, even if streaming failed part-way.
        try:
            ac.PlayStop("saqr_verify")
        except Exception as e:
            # Best-effort: report it, but never mask the original error.
            print(f" [play][WARN] PlayStop failed: {e}")
|
||
|
||
|
||
def countdown(seconds: int) -> None:
    """Show a one-line ``starting in N...`` countdown, one tick per second.

    Each tick overwrites the previous one via a carriage return; the line is
    blanked out once the count reaches zero.
    """
    for remaining in reversed(range(1, seconds + 1)):
        status = f" starting in {remaining}..."
        print(status, end="\r", flush=True)
        time.sleep(1)
    blank = " " + " " * 30
    print(blank, end="\r")
|
||
|
||
|
||
# ── main ────────────────────────────────────────────────────────────────────
|
||
def main():
    """CLI entry point: record (or TTS-capture) each phrase and save it as a WAV.

    Flow:
      1. Parse arguments and select the phrases to record (``--only``).
      2. Check that this machine has a G1-subnet address.
      3. In TTS mode or with ``--play``, initialise the Unitree ``AudioClient``.
      4. For each phrase: capture audio, optionally trim silence, save the
         WAV under ``ASSETS_ROOT/<category>/<key>.wav``, report levels, and
         optionally play it back.
      5. Print a summary with a ready-made command to re-record flagged clips.

    Exits with status 1 when no phrase matches ``--only``, when no G1-subnet
    address is found, or when ``unitree_sdk2py`` is required but missing.
    """
    # Level checks shared by the per-clip marker and the final summary.
    too_quiet_rms = 400
    clip_peak = 30000

    ap = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--mic", action="store_true",
                    help="record YOUR voice via the G1 mic instead of TtsMaker")
    ap.add_argument("--duration", type=float, default=None,
                    help="seconds to record per phrase (default: estimate per phrase)")
    ap.add_argument("--pause", type=float, default=1.0,
                    help="seconds of pause between phrases (default 1)")
    ap.add_argument("--only", default=None,
                    help="comma-separated keys to record (e.g. 'safe,helmet,vest')")
    ap.add_argument("--iface", default="eth0",
                    help="DDS iface for TtsMaker / PlayStream (default eth0)")
    ap.add_argument("--speaker-id", type=int, default=2,
                    help="TtsMaker speaker_id (2 = English on current firmware)")
    ap.add_argument("--volume", type=int, default=100,
                    help="G1 speaker volume 0–100 (default 100)")
    ap.add_argument("--play", action="store_true",
                    help="play each saved clip back on the G1 via PlayStream")
    ap.add_argument("--countdown", type=int, default=2,
                    help="countdown seconds before --mic recordings (default 2)")
    ap.add_argument("--no-trim", action="store_true",
                    help="don't auto-trim leading/trailing silence")
    ap.add_argument("--threshold", type=int, default=500,
                    help="amplitude threshold for silence trim (default 500)")
    args = ap.parse_args()

    known_keys = [k for _, k, _ in PHRASES]
    only = None
    if args.only:
        # Tolerate "safe, helmet" and trailing commas; flag typos instead of
        # silently recording fewer clips than the user asked for.
        only = {k.strip() for k in args.only.split(",") if k.strip()}
        unknown = sorted(only.difference(known_keys))
        if unknown:
            print(f"[WARN] ignoring unknown --only key(s): {', '.join(unknown)}")
    targets = [(c, k, t) for (c, k, t) in PHRASES if only is None or k in only]
    if not targets:
        print(f"No phrases match --only={args.only!r}. Known keys: "
              f"{', '.join(known_keys)}")
        sys.exit(1)

    try:
        local_ip = find_local_ip()
    except RuntimeError as e:
        print(f"[FATAL] {e}")
        sys.exit(1)

    mode = "mic" if args.mic else "tts"

    # Init AudioClient for TTS mode or --play.
    ac = None
    if mode == "tts" or args.play:
        try:
            from unitree_sdk2py.core.channel import ChannelFactoryInitialize
            from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
        except ImportError as e:
            print(f"[FATAL] unitree_sdk2py not available in this env: {e}")
            sys.exit(1)
        print(f"[init] ChannelFactoryInitialize(0, {args.iface!r})")
        ChannelFactoryInitialize(0, args.iface)
        ac = AudioClient()
        ac.SetTimeout(10.0)
        ac.Init()
        try:
            ac.SetVolume(args.volume)
        except Exception as e:
            # Volume is a convenience; recording can proceed without it.
            print(f"[init][WARN] SetVolume failed: {e}")
        print(f"[init] AudioClient ready (speaker_id={args.speaker_id} volume={args.volume})")
        print()

    mode_desc = "TtsMaker → mic" if mode == "tts" else "your voice → mic"
    duration_desc = "auto (per phrase)" if args.duration is None else f"{args.duration}s"
    trim_desc = "off" if args.no_trim else f"on (threshold={args.threshold})"

    print("=" * 60)
    print(f" Saqr phrase recorder — {len(targets)} clip(s)")
    print(f" Mode: {mode.upper()} "
          f"({mode_desc})")
    print(f" Duration: {duration_desc}")
    print(f" Pause: {args.pause}s between phrases")
    print(f" Trim: {trim_desc}")
    print(f" Output: {ASSETS_ROOT}")
    print(f" Mic feed: {MCAST_GRP}:{MCAST_PORT} (local IP: {local_ip})")
    print("=" * 60)
    print()

    if mode == "mic":
        print("Stand within ~1 m of the G1. When you see '>>> SPEAK NOW <<<' say")
        print("the SAY: line clearly. Target rms 500–5000.")
    else:
        print("The G1 will speak each phrase via TtsMaker while the script records")
        print("through the mic. Stay quiet during capture; any room noise you hear")
        print("will end up baked into the clip.")
    print()
    try:
        input("Press Enter to start…")
    except EOFError:
        # No interactive stdin: nothing to confirm with, so do nothing.
        return
    print()

    saved = []
    for i, (cat, key, text) in enumerate(targets, 1):
        out = ASSETS_ROOT / cat / f"{key}.wav"
        print(f"── [{i}/{len(targets)} {cat}/{key}] ──")
        print(f" SAY: {text}")

        try:
            if mode == "mic":
                if args.countdown > 0:
                    countdown(args.countdown)
                dur = args.duration if args.duration is not None else 5.0
                print(f" >>> SPEAK NOW — {dur}s <<<", flush=True)
                pcm = record_multicast_fixed(dur)
            else:
                dur = args.duration if args.duration is not None else \
                    estimate_tts_duration(text) + 1.5
                print(f" TtsMaker playing… (capturing {dur:.1f}s)", flush=True)
                pcm = record_while_tts_plays(ac, text, args.speaker_id, dur)
        except (OSError, RuntimeError) as e:
            # A socket/network hiccup on one phrase should not abort the whole
            # session and lose the summary of the clips already saved.
            print(f" [FAIL] capture error: {e}")
            pcm = b""

        if not pcm:
            print(f" [FAIL] no audio captured")
            print()
            continue

        if not args.no_trim:
            pcm = trim_leading_silence(pcm, threshold=args.threshold)
            pcm = trim_trailing_silence(pcm, threshold=args.threshold)

        rms, peak = rms_and_peak(pcm)
        dur_s = len(pcm) / 2.0 / RATE
        save_wav(pcm, out)

        if rms < too_quiet_rms:
            marker = f" ⚠ TOO QUIET — re-record with --only {key}"
        elif peak > clip_peak:
            marker = " ⚠ CLIPPING"
        else:
            marker = " ✓"
        print(f" saved → {out.relative_to(PROJECT_ROOT)} "
              f"({dur_s:.1f}s rms={rms:.0f} peak={peak}){marker}")
        saved.append((cat, key, out, dur_s, rms, peak))

        if args.play and ac is not None:
            print(f" playing back…")
            try:
                play_on_g1(ac, out)
            except Exception as e:
                print(f" [play] failed: {e}")

        if i < len(targets) and args.pause > 0:
            time.sleep(args.pause)
        print()

    # Summary
    print("=" * 60)
    print(f" Done — {len(saved)}/{len(targets)} clip(s) saved")
    print("=" * 60)
    bad = []
    for cat, key, path, dur, rms, peak in saved:
        if rms < too_quiet_rms:
            warn = " ⚠ low level"
        elif peak > clip_peak:
            warn = " ⚠ clipping"
        else:
            warn = ""
        if warn:
            bad.append(key)
        print(f" {cat}/{key:20s} {dur:4.1f}s rms={rms:5.0f} peak={peak:5d}{warn}")
    print()
    if bad:
        print(f"Re-record the flagged ones with:")
        print(f" python3 scripts/record_phrases.py --only {','.join(bad)}")
        print()
    print("Next:")
    print(" 1. (on robot) sudo systemctl restart saqr-bridge")
    print(" 2. expect: [audio_player] loaded N clip(s): ...")


if __name__ == "__main__":
    main()
|