#!/usr/bin/env python3
# Saqr/scripts/record_phrases.py
"""
record_phrases.py — Build Saqr's audio library for AudioClient.PlayStream.

Two modes:

  DEFAULT (TTS capture) — the script calls AudioClient.TtsMaker for each
                         phrase and simultaneously records the G1 speaker
                         output via the mic. Zero speaking required. Voice
                         is the G1's own TTS voice, but at runtime
                         PlayStream skips the firmware synthesis buffer so
                         it plays back ~200–700 ms faster.

  --mic (your voice)    — the script shows each phrase, counts down, and
                         records whatever the mic hears. You speak each
                         line yourself.

Both modes save 16 kHz mono int16 WAVs under
``assets/audio/<category>/<key>.wav`` — exactly what ``robot/audio_player``
expects.

Usage (run on the robot or any machine on the G1 subnet):

    python3 scripts/record_phrases.py                      # TTS capture, all 8
    python3 scripts/record_phrases.py --only safe,helmet   # just those two
    python3 scripts/record_phrases.py --iface enp3s0       # custom DDS iface
    python3 scripts/record_phrases.py --mic                # your voice instead
    python3 scripts/record_phrases.py --play               # verify each clip by replaying

Requires ``numpy`` (all modes: level check and silence trim) and
``unitree_sdk2py`` in the active conda env (TTS capture and --play).
"""
from __future__ import annotations

import argparse
import socket
import struct
import subprocess
import sys
import threading
import time
import wave
from pathlib import Path

# Two directory levels above this file (the parent of the directory holding
# the script); every output path is built from it.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Clips are saved as ASSETS_ROOT/<category>/<key>.wav.
ASSETS_ROOT  = PROJECT_ROOT / "assets" / "audio"

# UDP multicast group and port the mic feed is received on.
MCAST_GRP  = "239.168.123.161"
MCAST_PORT = 5555
# Sample rate in Hz used for every capture and every saved WAV (mono int16).
RATE       = 16000

# (category, key, text).
# ``category`` and ``key`` form the output path; ``key`` is also what --only
# matches against; ``text`` is both the TtsMaker input and the line shown to
# the speaker in --mic mode.
PHRASES = [
    ("fixed",          "ready",
     "Saqr is running. Press R2 plus X to start."),
    ("fixed",          "deactivated",
     "Saqr deactivated."),
    ("fixed",          "no_camera",
     "Camera not connected. Please plug in the camera and try again."),
    ("fixed",          "safe",
     "Safe to enter. Have a good day."),
    ("fixed",          "unsafe_generic",
     "Please stop. Wear your proper safety equipment."),
    ("unsafe_missing", "helmet",
     "Please stop. Wear your proper safety equipment. You are missing helmet."),
    ("unsafe_missing", "vest",
     "Please stop. Wear your proper safety equipment. You are missing vest."),
    ("unsafe_missing", "helmet_vest",
     "Please stop. Wear your proper safety equipment. You are missing helmet and vest."),
]


# ── mic capture ─────────────────────────────────────────────────────────────
def find_local_ip() -> str:
    """Return this machine's first IPv4 address on the 192.168.123.x subnet.

    The address is discovered by parsing the output of ``ip -4 -o addr``, so
    the ``ip`` tool must be installed.

    Returns:
        The dotted-quad address with the ``/prefix`` suffix removed.

    Raises:
        RuntimeError: if ``ip`` cannot be executed, or if no interface has a
            192.168.123.x address. Every failure is reported as
            ``RuntimeError`` so callers need a single ``except`` clause.
    """
    try:
        out = subprocess.run(
            ["ip", "-4", "-o", "addr"], capture_output=True, text=True
        ).stdout
    except OSError as e:
        # Missing/non-executable ``ip`` binary: surface it the same way as
        # "no address found" instead of leaking FileNotFoundError.
        raise RuntimeError(
            f"Could not run 'ip -4 -o addr' to find the local address: {e}"
        ) from e

    for line in out.splitlines():
        fields = line.split()
        # Only the token right after "inet" is the interface's own address;
        # "brd"/"peer" tokens can also fall inside the subnet.
        for label, value in zip(fields, fields[1:]):
            if label == "inet" and value.startswith("192.168.123."):
                return value.split("/")[0]
    raise RuntimeError(
        "No 192.168.123.x address on this machine — connect to the G1 network first."
    )


def _open_mcast_socket(local_ip: str) -> socket.socket:
    """Open a UDP socket joined to the mic multicast group.

    The socket is bound to ``MCAST_PORT`` on all interfaces and joins
    ``MCAST_GRP`` on the interface that owns ``local_ip``.

    Args:
        local_ip: IPv4 address of the local interface to join the group on.

    Returns:
        The ready-to-read socket; the caller is responsible for closing it.

    Raises:
        OSError: if ``local_ip`` is not a valid IPv4 address, or binding or
            joining the group fails. The socket is closed before the error
            propagates, so nothing leaks.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        # SO_REUSEADDR lets several listeners share the multicast port.
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(("", MCAST_PORT))
        mreq = struct.pack("4s4s", socket.inet_aton(MCAST_GRP), socket.inet_aton(local_ip))
        s.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
    except BaseException:
        # Setup failed part-way: release the descriptor, then re-raise.
        s.close()
        raise
    return s


def record_multicast_fixed(seconds: float) -> bytes:
    """Blocking mic capture for exactly ``seconds`` (used by --mic mode).

    Reads datagrams from the mic multicast feed until ``seconds`` worth of
    16-bit mono audio at ``RATE`` Hz has arrived, or until ``seconds + 5``
    has elapsed without enough data.

    Args:
        seconds: Requested clip length.

    Returns:
        At most ``seconds`` of raw int16 PCM, always a whole number of
        samples. Shorter (possibly empty) if the feed stalled.

    Raises:
        RuntimeError: propagated from ``find_local_ip``.
        OSError: propagated from opening or reading the socket.
    """
    local_ip = find_local_ip()
    # Count whole samples first so the byte target is always even.
    target_bytes = max(0, int(RATE * seconds)) * 2
    buf = bytearray()
    sock = _open_mcast_socket(local_ip)
    try:
        sock.settimeout(2.0)
        # Monotonic clock: a wall-clock adjustment must not stretch or cut
        # the capture window.
        deadline = time.monotonic() + seconds + 5
        while len(buf) < target_bytes and time.monotonic() < deadline:
            try:
                data, _ = sock.recvfrom(4096)
            except socket.timeout:
                continue
            buf.extend(data)
    finally:
        sock.close()
    # The last datagram usually overshoots the target; cut back to the
    # requested length, then drop a dangling odd byte from a stalled feed.
    del buf[target_bytes:]
    if len(buf) % 2:
        del buf[-1:]
    return bytes(buf)


def record_while_tts_plays(ac, text: str, speaker_id: int,
                           capture_seconds: float) -> bytes:
    """Start mic capture, call TtsMaker, capture for ``capture_seconds``, stop.

    A background thread reads the mic multicast feed while ``ac.TtsMaker`` is
    asked to speak ``text``. A non-zero return code is retried once after
    2 s; an exception from ``TtsMaker`` is not retried.

    Args:
        ac: Audio client exposing ``TtsMaker(text, speaker_id)``, which
            returns 0 on success.
        text: Phrase to synthesise.
        speaker_id: Voice id forwarded to ``TtsMaker``.
        capture_seconds: How long to keep recording after TtsMaker accepts
            the request.

    Returns:
        The captured int16 PCM (whole samples only), or ``b""`` if TtsMaker
        failed or the mic thread died before TTS started.

    Raises:
        RuntimeError: propagated from ``find_local_ip``.
    """
    local_ip = find_local_ip()
    buf = bytearray()
    stop_flag = threading.Event()

    def _mic():
        # Reader thread: append every datagram until asked to stop.
        sock = _open_mcast_socket(local_ip)
        try:
            # Short timeout so the stop flag is polled at least every 0.5 s.
            sock.settimeout(0.5)
            while not stop_flag.is_set():
                try:
                    data, _ = sock.recvfrom(4096)
                except socket.timeout:
                    continue
                buf.extend(data)
        finally:
            sock.close()

    def _speak() -> bool:
        # Ask TtsMaker to speak; one retry on a non-zero return code.
        try:
            code = ac.TtsMaker(text, speaker_id)
        except Exception as e:
            print(f"   [FAIL] TtsMaker raised: {e}")
            return False
        if code == 0:
            return True
        print(f"   [FAIL] TtsMaker rc={code} — retry in 2s…")
        time.sleep(2.0)
        try:
            code = ac.TtsMaker(text, speaker_id)
        except Exception as e:
            print(f"   [FAIL] TtsMaker retry raised: {e}")
            return False
        if code != 0:
            print(f"   [FAIL] TtsMaker retry rc={code}")
            return False
        return True

    t = threading.Thread(target=_mic, daemon=True)
    t.start()
    spoken = False
    try:
        time.sleep(0.3)   # let the mic socket settle before TTS fires
        if t.is_alive():
            spoken = _speak()
            if spoken:
                time.sleep(capture_seconds)
        else:
            # The reader only exits early when the socket could not be
            # opened; speaking now would record nothing.
            print("   [FAIL] mic capture thread exited before TTS started")
    finally:
        # Single cleanup path, also taken on Ctrl-C during the capture.
        stop_flag.set()
        t.join(timeout=1.0)

    if not spoken:
        return b""
    pcm = bytes(buf)
    # Keep whole int16 samples only; a dangling byte would break decoding.
    return pcm[:len(pcm) - len(pcm) % 2]


# ── processing ──────────────────────────────────────────────────────────────
def rms_and_peak(pcm: bytes) -> tuple[float, int]:
    """Return ``(rms, peak)`` of a raw int16 PCM buffer.

    Args:
        pcm: Native-endian signed 16-bit samples. A dangling odd trailing
            byte is ignored.

    Returns:
        ``rms`` as a float and ``peak`` as the largest absolute sample value
        (0..32768). ``(0.0, 0)`` when the buffer holds no complete sample.
    """
    import numpy as np
    n_samples = len(pcm) // 2
    if n_samples == 0:
        return 0.0, 0
    samples = np.frombuffer(pcm, dtype=np.int16, count=n_samples)
    # Widen before abs/square: in int16, abs(-32768) wraps back to -32768,
    # which would hide negative full-scale clipping. float64 also keeps the
    # mean accurate on long clips.
    wide = samples.astype(np.float64)
    rms = float(np.sqrt(np.mean(wide * wide)))
    peak = int(np.abs(wide).max())
    return rms, peak


def trim_leading_silence(pcm: bytes, threshold: int = 500,
                         head_ms: int = 80, rate: int | None = None) -> bytes:
    """Cut silence before the first sample above threshold; keep ``head_ms`` lead-in.

    Args:
        pcm: Native-endian int16 mono PCM. A dangling odd trailing byte is
            dropped.
        threshold: A sample counts as sound when its absolute value is
            strictly greater than this.
        head_ms: Milliseconds of audio to keep before the first loud sample.
        rate: Sample rate in Hz; defaults to the module-level ``RATE``.

    Returns:
        The trimmed PCM, or the input unchanged (apart from odd-byte
        alignment) when no sample exceeds ``threshold``.
    """
    import numpy as np
    if rate is None:
        rate = RATE
    if len(pcm) % 2:
        pcm = pcm[:-1]
    samples = np.frombuffer(pcm, dtype=np.int16)
    # Widen to int32 first: int16 abs(-32768) wraps to -32768 and would make
    # a full-scale negative sample look like silence.
    loud = np.where(np.abs(samples.astype(np.int32)) > threshold)[0]
    if loud.size == 0:
        return pcm
    start = max(0, int(loud[0]) - int(head_ms / 1000.0 * rate))
    return samples[start:].tobytes()


def trim_trailing_silence(pcm: bytes, threshold: int = 500,
                          tail_ms: int = 150, rate: int | None = None) -> bytes:
    """Cut silence after the last sample above threshold; keep ``tail_ms`` of tail.

    Args:
        pcm: Native-endian int16 mono PCM. A dangling odd trailing byte is
            dropped.
        threshold: A sample counts as sound when its absolute value is
            strictly greater than this.
        tail_ms: Milliseconds of audio to keep after the last loud sample.
        rate: Sample rate in Hz; defaults to the module-level ``RATE``.

    Returns:
        The trimmed PCM, or the input unchanged (apart from odd-byte
        alignment) when no sample exceeds ``threshold``.
    """
    import numpy as np
    if rate is None:
        rate = RATE
    if len(pcm) % 2:
        pcm = pcm[:-1]
    samples = np.frombuffer(pcm, dtype=np.int16)
    # Widen to int32 first: int16 abs(-32768) wraps to -32768 and would make
    # a full-scale negative sample look like silence.
    loud = np.where(np.abs(samples.astype(np.int32)) > threshold)[0]
    if loud.size == 0:
        return pcm
    # +1 so the last loud sample itself is kept (slice ends are exclusive),
    # even when tail_ms is 0.
    end = int(loud[-1]) + 1 + int(tail_ms / 1000.0 * rate)
    end = min(end, samples.size)
    return samples[:end].tobytes()


def save_wav(pcm: bytes, path: Path) -> None:
    """Write ``pcm`` to ``path`` as a mono 16-bit WAV at ``RATE`` Hz.

    Missing parent directories are created; an existing file is overwritten.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with wave.open(str(path), "wb") as writer:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # the frame count is patched in when the frames are written.
        writer.setparams((1, 2, RATE, 0, "NONE", "not compressed"))
        writer.writeframes(pcm)


def estimate_tts_duration(text: str) -> float:
    """Estimate how long TTS takes to speak ``text``, in seconds.

    Uses 0.12 s per character with a 2.5 s floor (per the original note, this
    matches RobotController's pacing estimate).
    """
    per_char_estimate = len(text) * 0.12
    if per_char_estimate < 2.5:
        return 2.5
    return per_char_estimate


# ── optional playback verification ──────────────────────────────────────────
def play_on_g1(ac, path: Path) -> None:
    """Replay a saved WAV through ``ac.PlayStream`` to verify the clip.

    The PCM is sent in chunks under one stream id, the call then waits for
    the clip length plus 0.3 s, and finally stops the stream.

    Args:
        ac: Audio client exposing ``PlayStream(app, stream_id, pcm)`` and
            ``PlayStop(app)``.
        path: WAV file to play; assumed to be int16 mono at ``RATE`` Hz,
            as written by ``save_wav``.

    Raises:
        Exception: whatever ``wave.open`` or ``PlayStream`` raises. A
            ``PlayStop`` failure is only reported, never raised.
    """
    with wave.open(str(path), "rb") as wf:
        pcm = wf.readframes(wf.getnframes())
    chunk_bytes = 96000
    bytes_per_second = RATE * 2
    sid = f"verify_{int(time.time() * 1000)}"
    try:
        for offset in range(0, len(pcm), chunk_bytes):
            chunk = pcm[offset:offset + chunk_bytes]
            ac.PlayStream("saqr_verify", sid, chunk)
            # Pace uploads at twice real time.
            time.sleep(len(chunk) / bytes_per_second / 2)
        time.sleep(len(pcm) / bytes_per_second + 0.3)
    finally:
        # Always stop the stream, even if an upload failed mid-way, so a
        # half-sent clip is not left open under the "saqr_verify" app name.
        try:
            ac.PlayStop("saqr_verify")
        except Exception as e:
            # Best effort: report it, but don't mask the original error or
            # abort the recording session over a failed stop.
            print(f"   [play][WARN] PlayStop failed: {e}")


def countdown(seconds: int) -> None:
    """Show a one-line ``starting in N...`` countdown, one tick per second.

    The line is rewritten in place with a carriage return and blanked out
    once the countdown ends. Nothing is counted when ``seconds`` <= 0.
    """
    remaining = seconds
    while remaining > 0:
        print(f"  starting in {remaining}...", end="\r", flush=True)
        time.sleep(1)
        remaining -= 1
    # Overwrite the countdown text with spaces.
    blank = "  " + " " * 30
    print(blank, end="\r")


# ── main ────────────────────────────────────────────────────────────────────
def main():
    """Run the phrase recorder CLI.

    Steps:
      1. Parse options and select the phrases to record (``--only``).
      2. Check that this machine is on the G1 subnet.
      3. For TTS mode or ``--play``, initialise the Unitree ``AudioClient``.
      4. For each phrase: capture (TTS or your voice), optionally trim
         silence, save the WAV, report levels, optionally play it back.
      5. Print a summary and the command to re-record flagged clips.

    Exits with status 1 when no phrase matches ``--only``, when no
    192.168.123.x address is found, or when ``unitree_sdk2py`` is needed but
    not importable.
    """
    # Level thresholds shared by the per-clip marker and the final summary,
    # so the two reports can never disagree.
    quiet_rms = 400
    clip_peak = 30000

    ap = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--mic",         action="store_true",
                    help="record YOUR voice via the G1 mic instead of TtsMaker")
    ap.add_argument("--duration",    type=float, default=None,
                    help="seconds to record per phrase (default: estimate per phrase)")
    ap.add_argument("--pause",       type=float, default=1.0,
                    help="seconds of pause between phrases (default 1)")
    ap.add_argument("--only",        default=None,
                    help="comma-separated keys to record (e.g. 'safe,helmet,vest')")
    ap.add_argument("--iface",       default="eth0",
                    help="DDS iface for TtsMaker / PlayStream (default eth0)")
    ap.add_argument("--speaker-id",  type=int, default=2,
                    help="TtsMaker speaker_id (2 = English on current firmware)")
    ap.add_argument("--volume",      type=int, default=100,
                    help="G1 speaker volume 0–100 (default 100)")
    ap.add_argument("--play",        action="store_true",
                    help="play each saved clip back on the G1 via PlayStream")
    ap.add_argument("--countdown",   type=int, default=2,
                    help="countdown seconds before --mic recordings (default 2)")
    ap.add_argument("--no-trim",     action="store_true",
                    help="don't auto-trim leading/trailing silence")
    ap.add_argument("--threshold",   type=int, default=500,
                    help="amplitude threshold for silence trim (default 500)")
    args = ap.parse_args()

    known_keys = [k for _, k, _ in PHRASES]
    only = None
    if args.only:
        # Tolerate "safe, helmet": strip spaces and ignore empty items.
        only = {k.strip() for k in args.only.split(",") if k.strip()}
        unknown = sorted(only.difference(known_keys))
        if unknown:
            # A typo would otherwise silently record fewer clips than asked.
            print(f"[WARN] ignoring unknown --only key(s): {', '.join(unknown)}")
    targets = [(c, k, t) for (c, k, t) in PHRASES if only is None or k in only]
    if not targets:
        print(f"No phrases match --only={args.only!r}. Known keys: "
              f"{', '.join(known_keys)}")
        sys.exit(1)

    try:
        local_ip = find_local_ip()
    except RuntimeError as e:
        print(f"[FATAL] {e}")
        sys.exit(1)

    mode = "mic" if args.mic else "tts"

    # Init AudioClient for TTS mode or --play.
    ac = None
    if mode == "tts" or args.play:
        try:
            from unitree_sdk2py.core.channel import ChannelFactoryInitialize
            from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
        except ImportError as e:
            print(f"[FATAL] unitree_sdk2py not available in this env: {e}")
            sys.exit(1)
        print(f"[init] ChannelFactoryInitialize(0, {args.iface!r})")
        ChannelFactoryInitialize(0, args.iface)
        ac = AudioClient()
        ac.SetTimeout(10.0)
        ac.Init()
        try:
            ac.SetVolume(args.volume)
        except Exception as e:
            # Volume is a convenience; recording can proceed without it.
            print(f"[init][WARN] SetVolume failed: {e}")
        print(f"[init] AudioClient ready (speaker_id={args.speaker_id} volume={args.volume})")
        print()

    duration_label = "auto (per phrase)" if args.duration is None else f"{args.duration}s"
    trim_label = "off" if args.no_trim else f"on (threshold={args.threshold})"

    print("=" * 60)
    print(f" Saqr phrase recorder — {len(targets)} clip(s)")
    print(f" Mode:      {mode.upper()}  "
          f"({'TtsMaker → mic' if mode == 'tts' else 'your voice → mic'})")
    print(f" Duration:  {duration_label}")
    print(f" Pause:     {args.pause}s between phrases")
    print(f" Trim:      {trim_label}")
    print(f" Output:    {ASSETS_ROOT}")
    print(f" Mic feed:  {MCAST_GRP}:{MCAST_PORT} (local IP: {local_ip})")
    print("=" * 60)
    print()

    if mode == "mic":
        print("Stand within ~1 m of the G1. When you see '>>> SPEAK NOW <<<' say")
        print("the SAY: line clearly. Target rms 500–5000.")
    else:
        print("The G1 will speak each phrase via TtsMaker while the script records")
        print("through the mic. Stay quiet during capture; any room noise you hear")
        print("will end up baked into the clip.")
    print()
    try:
        input("Press Enter to start…")
    except EOFError:
        # stdin closed (non-interactive run): nothing to record, leave quietly.
        return
    print()

    saved = []
    for i, (cat, key, text) in enumerate(targets, 1):
        out = ASSETS_ROOT / cat / f"{key}.wav"
        print(f"── [{i}/{len(targets)}  {cat}/{key}] ──")
        print(f"   SAY: {text}")

        if mode == "mic":
            if args.countdown > 0:
                countdown(args.countdown)
            dur = args.duration if args.duration is not None else 5.0
            print(f"   >>> SPEAK NOW — {dur}s <<<", flush=True)
            pcm = record_multicast_fixed(dur)
        else:
            # 1.5 s of slack on top of the speech estimate so the end of the
            # phrase is not cut off.
            dur = args.duration if args.duration is not None else \
                  estimate_tts_duration(text) + 1.5
            print(f"   TtsMaker playing… (capturing {dur:.1f}s)", flush=True)
            pcm = record_while_tts_plays(ac, text, args.speaker_id, dur)

        # Keep whole int16 samples only: a dangling odd byte would make the
        # numpy-based trim/level helpers raise.
        pcm = pcm[:len(pcm) - len(pcm) % 2]
        if not pcm:
            print("   [FAIL] no audio captured")
            print()
            continue

        if not args.no_trim:
            pcm = trim_leading_silence(pcm, threshold=args.threshold)
            pcm = trim_trailing_silence(pcm, threshold=args.threshold)

        rms, peak = rms_and_peak(pcm)
        dur_s = len(pcm) / 2.0 / RATE
        # Saved even when flagged below, so the clip can be inspected.
        save_wav(pcm, out)

        if rms < quiet_rms:
            marker = f"  ⚠ TOO QUIET — re-record with --only {key}"
        elif peak > clip_peak:
            marker = "  ⚠ CLIPPING"
        else:
            marker = "  ✓"
        print(f"   saved → {out.relative_to(PROJECT_ROOT)}  "
              f"({dur_s:.1f}s  rms={rms:.0f}  peak={peak}){marker}")
        saved.append((cat, key, out, dur_s, rms, peak))

        if args.play and ac is not None:
            print("   playing back…")
            try:
                play_on_g1(ac, out)
            except Exception as e:
                # Playback is verification only; keep recording the rest.
                print(f"   [play] failed: {e}")

        if i < len(targets) and args.pause > 0:
            time.sleep(args.pause)
        print()

    # Summary
    print("=" * 60)
    print(f" Done — {len(saved)}/{len(targets)} clip(s) saved")
    print("=" * 60)
    bad = []
    for cat, key, path, dur, rms, peak in saved:
        if rms < quiet_rms:
            warn = " ⚠ low level"
        elif peak > clip_peak:
            warn = " ⚠ clipping"
        else:
            warn = ""
        if warn:
            bad.append(key)
        print(f"  {cat}/{key:20s}  {dur:4.1f}s  rms={rms:5.0f}  peak={peak:5d}{warn}")
    print()
    if bad:
        print("Re-record the flagged ones with:")
        print(f"  python3 scripts/record_phrases.py --only {','.join(bad)}")
        print()
    print("Next:")
    print("  1. (on robot) sudo systemctl restart saqr-bridge")
    print("  2. expect:   [audio_player] loaded N clip(s): ...")

# Run the recorder only when executed as a script, not when imported.
if __name__ == "__main__":
    main()