#!/usr/bin/env python3
"""
record_phrases.py — Build Saqr's audio library for AudioClient.PlayStream.

Two modes:

  DEFAULT (TTS capture) — the script calls AudioClient.TtsMaker for each
      phrase and simultaneously records the G1 speaker output via the mic.
      Zero speaking required. Voice is the G1's own TTS voice, but at
      runtime PlayStream skips the firmware synthesis buffer so it plays
      back ~200–700 ms faster.

  --mic (your voice) — the script shows each phrase, counts down, and
      records whatever the mic hears. You speak each line yourself.

Both modes save 16 kHz mono int16 WAVs under
``assets/audio/<category>/<key>.wav`` — exactly what ``robot/audio_player``
expects.

Usage (run on the robot or any machine on the G1 subnet):
    python3 scripts/record_phrases.py                      # TTS capture, all 8
    python3 scripts/record_phrases.py --only safe,helmet   # just those two
    python3 scripts/record_phrases.py --iface enp3s0       # custom DDS iface
    python3 scripts/record_phrases.py --mic                # your voice instead
    python3 scripts/record_phrases.py --play               # verify each clip by replaying

Requires ``unitree_sdk2py`` in the active conda env (TTS-capture and --play).
"""
from __future__ import annotations

import argparse
import socket
import struct
import subprocess
import sys
import threading
import time
import wave
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent
ASSETS_ROOT = PROJECT_ROOT / "assets" / "audio"

# Multicast group/port the mic feed is read from.
MCAST_GRP = "239.168.123.161"
MCAST_PORT = 5555
# Sample rate written to every WAV; samples are 16-bit mono (2 bytes each).
RATE = 16000

# Level checks applied to every saved clip.
MIN_RMS = 400
CLIP_PEAK = 30000

# (category, key, text).
PHRASES = [
    ("fixed", "ready", "Saqr is running. Press R2 plus X to start."),
    ("fixed", "deactivated", "Saqr deactivated."),
    ("fixed", "no_camera",
     "Camera not connected. Please plug in the camera and try again."),
    ("fixed", "safe", "Safe to enter. Have a good day."),
    ("fixed", "unsafe_generic",
     "Please stop. Wear your proper safety equipment."),
    ("unsafe_missing", "helmet",
     "Please stop. Wear your proper safety equipment. You are missing helmet."),
    ("unsafe_missing", "vest",
     "Please stop. Wear your proper safety equipment. You are missing vest."),
    ("unsafe_missing", "helmet_vest",
     "Please stop. Wear your proper safety equipment. You are missing helmet and vest."),
]


# ── mic capture ─────────────────────────────────────────────────────────────

def find_local_ip() -> str:
    """Pick the first 192.168.123.x address on this machine.

    Scans the output of ``ip -4 -o addr`` token by token.

    Raises:
        RuntimeError: if ``ip`` cannot be executed or no matching address
            is found. (A missing ``ip`` binary is reported as RuntimeError
            so callers that handle "no G1 network" also handle this.)
    """
    try:
        out = subprocess.run(
            ["ip", "-4", "-o", "addr"], capture_output=True, text=True
        ).stdout
    except OSError as e:
        raise RuntimeError(f"Could not run 'ip -4 -o addr': {e}") from e
    for line in out.splitlines():
        for tok in line.split():
            if tok.startswith("192.168.123."):
                # Tokens look like "192.168.123.164/24"; drop the prefix length.
                return tok.split("/")[0]
    raise RuntimeError(
        "No 192.168.123.x address on this machine — connect to the G1 network first."
    )


def _open_mcast_socket(local_ip: str) -> socket.socket:
    """Open a UDP socket joined to the mic multicast group via ``local_ip``.

    The socket is closed again if binding or joining the group fails, so a
    failed call never leaks a file descriptor.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(("", MCAST_PORT))
        mreq = struct.pack(
            "4s4s", socket.inet_aton(MCAST_GRP), socket.inet_aton(local_ip)
        )
        s.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
    except Exception:
        s.close()
        raise
    return s


def record_multicast_fixed(seconds: float) -> bytes:
    """Blocking mic capture for exactly ``seconds`` (used by --mic mode).

    Reads datagrams until ``seconds`` worth of 16-bit samples has arrived or
    ``seconds + 5`` of wall time has passed, whichever comes first. The
    result is cut to the requested length, so it may be shorter (feed
    stalled) but never longer than asked for.
    """
    local_ip = find_local_ip()
    target_bytes = max(0, int(RATE * 2 * seconds))
    target_bytes -= target_bytes % 2  # keep whole 16-bit samples only
    deadline = time.monotonic() + seconds + 5
    buf = bytearray()
    sock = _open_mcast_socket(local_ip)
    try:
        sock.settimeout(2.0)
        while len(buf) < target_bytes and time.monotonic() < deadline:
            try:
                data, _ = sock.recvfrom(4096)
            except socket.timeout:
                continue
            buf.extend(data)
    finally:
        sock.close()
    return bytes(buf[:target_bytes])


def _speak_with_retry(ac, text: str, speaker_id: int) -> bool:
    """Call ``ac.TtsMaker``; on a non-zero return code retry once after 2 s.

    Returns True when a call returned 0, False otherwise (failure details
    are printed).
    """
    try:
        code = ac.TtsMaker(text, speaker_id)
    except Exception as e:
        print(f" [FAIL] TtsMaker raised: {e}")
        return False
    if code == 0:
        return True

    print(f" [FAIL] TtsMaker rc={code} — retry in 2s…")
    time.sleep(2.0)
    try:
        code = ac.TtsMaker(text, speaker_id)
    except Exception as e:
        print(f" [FAIL] TtsMaker retry raised: {e}")
        return False
    if code != 0:
        print(f" [FAIL] TtsMaker retry rc={code}")
        return False
    return True


def record_while_tts_plays(ac, text: str, speaker_id: int,
                           capture_seconds: float) -> bytes:
    """Start mic capture, call TtsMaker, capture for ``capture_seconds``, stop.

    Args:
        ac: audio client object exposing ``TtsMaker(text, speaker_id)``.
        text: phrase to synthesize.
        speaker_id: passed through to ``TtsMaker``.
        capture_seconds: how long to keep recording after TtsMaker returns.

    Returns:
        The raw bytes received from the mic feed, or ``b""`` if TtsMaker
        failed (after one retry) or the mic socket could not be opened.
    """
    local_ip = find_local_ip()
    buf = bytearray()
    stop_flag = threading.Event()
    mic_errors = []

    def _mic():
        try:
            sock = _open_mcast_socket(local_ip)
        except OSError as e:
            # Surface the failure to the caller instead of dying silently.
            mic_errors.append(e)
            return
        try:
            sock.settimeout(0.5)  # short timeout so stop_flag is polled often
            while not stop_flag.is_set():
                try:
                    data, _ = sock.recvfrom(4096)
                except socket.timeout:
                    continue
                buf.extend(data)
        finally:
            sock.close()

    t = threading.Thread(target=_mic, daemon=True)
    t.start()
    try:
        time.sleep(0.3)  # let the mic socket settle before TTS fires
        if mic_errors:
            print(f" [FAIL] mic socket: {mic_errors[0]}")
            return b""
        if not _speak_with_retry(ac, text, speaker_id):
            return b""
        time.sleep(capture_seconds)
    finally:
        # Single cleanup path: always stop and reap the capture thread.
        stop_flag.set()
        t.join(timeout=1.0)
    return bytes(buf)


# ── processing ──────────────────────────────────────────────────────────────

def _as_int16(pcm: bytes):
    """View ``pcm`` as an int16 array, ignoring a dangling odd trailing byte."""
    import numpy as np
    return np.frombuffer(pcm, dtype=np.int16, count=len(pcm) // 2)


def _ms_to_samples(ms: float) -> int:
    """Convert a duration in milliseconds to a sample count at ``RATE``."""
    return int(ms * RATE / 1000.0)


def rms_and_peak(pcm: bytes):
    """Return ``(rms, peak)`` of 16-bit PCM; ``(0.0, 0)`` for empty input.

    Samples are widened before ``abs`` so that -32768 reports a peak of
    32768 instead of overflowing int16.
    """
    import numpy as np
    a = _as_int16(pcm)
    if a.size == 0:
        return 0.0, 0
    rms = float(np.sqrt(np.mean(a.astype(np.float64) ** 2)))
    peak = int(np.abs(a.astype(np.int32)).max())
    return rms, peak


def trim_leading_silence(pcm: bytes, threshold: int = 500,
                         head_ms: int = 80) -> bytes:
    """Cut silence before the first sample above threshold; keep ``head_ms`` lead-in.

    Returns ``pcm`` unchanged when no sample exceeds ``threshold``.
    """
    import numpy as np
    a = _as_int16(pcm)
    # int32 so that abs(-32768) does not wrap and get mistaken for silence.
    loud = np.flatnonzero(np.abs(a.astype(np.int32)) > threshold)
    if loud.size == 0:
        return pcm
    start = max(0, int(loud[0]) - _ms_to_samples(head_ms))
    return a[start:].tobytes()


def trim_trailing_silence(pcm: bytes, threshold: int = 500,
                          tail_ms: int = 150) -> bytes:
    """Cut silence after the last sample above threshold; keep ``tail_ms`` of tail.

    Returns ``pcm`` unchanged when no sample exceeds ``threshold``.
    """
    import numpy as np
    a = _as_int16(pcm)
    loud = np.flatnonzero(np.abs(a.astype(np.int32)) > threshold)
    if loud.size == 0:
        return pcm
    end = min(int(loud[-1]) + _ms_to_samples(tail_ms), a.size)
    return a[:end].tobytes()


def save_wav(pcm: bytes, path: Path) -> None:
    """Write ``pcm`` as a mono, 16-bit, ``RATE`` Hz WAV, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with wave.open(str(path), "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(RATE)
        wf.writeframes(pcm)


def estimate_tts_duration(text: str) -> float:
    """Match RobotController's pacing estimate (0.12 s/char, min 2.5 s)."""
    return max(2.5, len(text) * 0.12)


# ── optional playback verification ──────────────────────────────────────────

def play_on_g1(ac, path: Path) -> None:
    """Stream the WAV at ``path`` to the G1 via ``ac.PlayStream`` and wait.

    The clip is sent in 96000-byte chunks (3 s of audio at 16 kHz int16),
    pausing half a chunk's duration between sends, then the function sleeps
    for the clip length plus 0.3 s before calling ``PlayStop``.
    """
    with wave.open(str(path), "rb") as wf:
        pcm = wf.readframes(wf.getnframes())
    CHUNK = 96000
    sid = f"verify_{int(time.time() * 1000)}"
    offset = 0
    while offset < len(pcm):
        chunk = pcm[offset:offset + CHUNK]
        ac.PlayStream("saqr_verify", sid, chunk)
        offset += len(chunk)
        time.sleep(len(chunk) / (RATE * 2) / 2)
    time.sleep(len(pcm) / (RATE * 2) + 0.3)
    try:
        ac.PlayStop("saqr_verify")
    except Exception:
        # Best-effort: the clip has already finished playing by now.
        pass


def countdown(seconds: int) -> None:
    """Print a one-line countdown, then blank the line."""
    for i in range(seconds, 0, -1):
        print(f" starting in {i}...", end="\r", flush=True)
        time.sleep(1)
    print(" " + " " * 30, end="\r")


# ── main ────────────────────────────────────────────────────────────────────

def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser (module docstring is the description)."""
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--mic", action="store_true",
                    help="record YOUR voice via the G1 mic instead of TtsMaker")
    ap.add_argument("--duration", type=float, default=None,
                    help="seconds to record per phrase (default: estimate per phrase)")
    ap.add_argument("--pause", type=float, default=1.0,
                    help="seconds of pause between phrases (default 1)")
    ap.add_argument("--only", default=None,
                    help="comma-separated keys to record (e.g. 'safe,helmet,vest')")
    ap.add_argument("--iface", default="eth0",
                    help="DDS iface for TtsMaker / PlayStream (default eth0)")
    ap.add_argument("--speaker-id", type=int, default=2,
                    help="TtsMaker speaker_id (2 = English on current firmware)")
    ap.add_argument("--volume", type=int, default=100,
                    help="G1 speaker volume 0–100 (default 100)")
    ap.add_argument("--play", action="store_true",
                    help="play each saved clip back on the G1 via PlayStream")
    ap.add_argument("--countdown", type=int, default=2,
                    help="countdown seconds before --mic recordings (default 2)")
    ap.add_argument("--no-trim", action="store_true",
                    help="don't auto-trim leading/trailing silence")
    ap.add_argument("--threshold", type=int, default=500,
                    help="amplitude threshold for silence trim (default 500)")
    return ap


def _select_targets(only_spec):
    """Resolve ``--only`` into ``(targets, unknown_keys)``.

    Keys are stripped of surrounding whitespace so ``"safe, helmet"`` works.
    A missing/empty spec selects every phrase.
    """
    if not only_spec:
        return list(PHRASES), []
    wanted = {k.strip() for k in only_spec.split(",") if k.strip()}
    known = {k for _, k, _ in PHRASES}
    targets = [p for p in PHRASES if p[1] in wanted]
    return targets, sorted(wanted - known)


def _init_audio_client(args):
    """Import the Unitree SDK, initialise DDS on ``args.iface`` and return a client.

    Exits the process with status 1 when ``unitree_sdk2py`` is not importable.
    """
    try:
        from unitree_sdk2py.core.channel import ChannelFactoryInitialize
        from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
    except ImportError as e:
        print(f"[FATAL] unitree_sdk2py not available in this env: {e}")
        sys.exit(1)
    print(f"[init] ChannelFactoryInitialize(0, {args.iface!r})")
    ChannelFactoryInitialize(0, args.iface)
    ac = AudioClient()
    ac.SetTimeout(10.0)
    ac.Init()
    try:
        ac.SetVolume(args.volume)
    except Exception as e:
        # Volume is a nicety; recording can proceed at the current level.
        print(f"[init][WARN] SetVolume failed: {e}")
    print(f"[init] AudioClient ready (speaker_id={args.speaker_id} volume={args.volume})")
    return ac


def main():
    """Record (or TTS-capture) the selected phrases, save them, print a summary."""
    args = _build_parser().parse_args()

    targets, unknown = _select_targets(args.only)
    if unknown:
        print(f"[WARN] unknown --only key(s) ignored: {', '.join(unknown)}")
    if not targets:
        print(f"No phrases match --only={args.only!r}. Known keys: "
              f"{', '.join(k for _, k, _ in PHRASES)}")
        sys.exit(1)

    try:
        local_ip = find_local_ip()
    except RuntimeError as e:
        print(f"[FATAL] {e}")
        sys.exit(1)

    mode = "mic" if args.mic else "tts"

    # Init AudioClient for TTS mode or --play.
    ac = None
    if mode == "tts" or args.play:
        ac = _init_audio_client(args)

    print()
    print("=" * 60)
    print(f" Saqr phrase recorder — {len(targets)} clip(s)")
    print(f" Mode: {mode.upper()} "
          f"({'TtsMaker → mic' if mode == 'tts' else 'your voice → mic'})")
    print(f" Duration: {('auto (per phrase)' if args.duration is None else f'{args.duration}s')}")
    print(f" Pause: {args.pause}s between phrases")
    print(f" Trim: {'off' if args.no_trim else f'on (threshold={args.threshold})'}")
    print(f" Output: {ASSETS_ROOT}")
    print(f" Mic feed: {MCAST_GRP}:{MCAST_PORT} (local IP: {local_ip})")
    print("=" * 60)
    print()
    if mode == "mic":
        print("Stand within ~1 m of the G1. When you see '>>> SPEAK NOW <<<' say")
        print("the SAY: line clearly. Target rms 500–5000.")
    else:
        print("The G1 will speak each phrase via TtsMaker while the script records")
        print("through the mic. Stay quiet during capture; any room noise you hear")
        print("will end up baked into the clip.")
    print()
    try:
        input("Press Enter to start…")
    except EOFError:
        return
    print()

    saved = []
    for i, (cat, key, text) in enumerate(targets, 1):
        out = ASSETS_ROOT / cat / f"{key}.wav"
        print(f"── [{i}/{len(targets)} {cat}/{key}] ──")
        print(f" SAY: {text}")

        if mode == "mic":
            if args.countdown > 0:
                countdown(args.countdown)
            dur = args.duration if args.duration is not None else 5.0
            print(f" >>> SPEAK NOW — {dur}s <<<", flush=True)
            pcm = record_multicast_fixed(dur)
        else:
            # Auto duration: pacing estimate plus 1.5 s of slack.
            dur = (args.duration if args.duration is not None
                   else estimate_tts_duration(text) + 1.5)
            print(f" TtsMaker playing… (capturing {dur:.1f}s)", flush=True)
            pcm = record_while_tts_plays(ac, text, args.speaker_id, dur)

        if not pcm:
            print(" [FAIL] no audio captured")
            print()
            continue

        if not args.no_trim:
            pcm = trim_leading_silence(pcm, threshold=args.threshold)
            pcm = trim_trailing_silence(pcm, threshold=args.threshold)

        rms, peak = rms_and_peak(pcm)
        dur_s = len(pcm) / 2.0 / RATE
        save_wav(pcm, out)

        if rms < MIN_RMS:
            marker = f" ⚠ TOO QUIET — re-record with --only {key}"
        elif peak > CLIP_PEAK:
            marker = " ⚠ CLIPPING"
        else:
            marker = " ✓"
        print(f" saved → {out.relative_to(PROJECT_ROOT)} "
              f"({dur_s:.1f}s rms={rms:.0f} peak={peak}){marker}")
        saved.append((cat, key, out, dur_s, rms, peak))

        if args.play and ac is not None:
            print(" playing back…")
            try:
                play_on_g1(ac, out)
            except Exception as e:
                print(f" [play] failed: {e}")

        if i < len(targets) and args.pause > 0:
            time.sleep(args.pause)
        print()

    # Summary
    print("=" * 60)
    print(f" Done — {len(saved)}/{len(targets)} clip(s) saved")
    print("=" * 60)
    bad = []
    for cat, key, path, dur, rms, peak in saved:
        if rms < MIN_RMS:
            warn = " ⚠ low level"
        elif peak > CLIP_PEAK:
            warn = " ⚠ clipping"
        else:
            warn = ""
        if warn:
            bad.append(key)
        print(f" {cat}/{key:20s} {dur:4.1f}s rms={rms:5.0f} peak={peak:5d}{warn}")
    print()
    if bad:
        print("Re-record the flagged ones with:")
        print(f" python3 scripts/record_phrases.py --only {','.join(bad)}")
        print()
    print("Next:")
    print(" 1. (on robot) sudo systemctl restart saqr-bridge")
    print(" 2. expect: [audio_player] loaded N clip(s): ...")


if __name__ == "__main__":
    main()