Saqr/scripts/generate_phrases.py

#!/usr/bin/env python3
"""
generate_phrases.py — synthesize Saqr's 8 phrases to WAV using piper-tts.

This produces the audio library locally (no G1 mic, no TtsMaker capture,
no PulseAudio) at exactly the format robot.audio_player expects:
16 kHz mono int16 WAV under assets/audio/<category>/<key>.wav.

Setup (once):
    pip install piper-tts
    python -m piper.download_voices en_US-amy-medium

Usage:
    python scripts/generate_phrases.py
    python scripts/generate_phrases.py --voice en_US-lessac-medium
    python scripts/generate_phrases.py --only safe,helmet_vest
    python scripts/generate_phrases.py --voice-dir /custom/path

Common voices (run the download command above, swap the name):
    en_US-amy-medium      female, neutral         (~60 MB)
    en_US-lessac-medium   female, friendly        (~60 MB)
    en_US-ryan-high       male, clear             (~120 MB)
    en_GB-alan-medium     male, British           (~60 MB)
"""
from __future__ import annotations

import argparse
import io
import sys
import wave
from pathlib import Path

# This script lives in <project>/scripts/, so the project root is two levels
# above the resolved script path.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
ASSETS_ROOT = PROJECT_ROOT.joinpath("assets", "audio")

# Every entry is (category, key, text). main() writes each phrase to
# ASSETS_ROOT/<category>/<key>.wav, and --only filters on <key>.
PHRASES = [
    # --- category "fixed" ---
    ("fixed", "ready", "Saqr is running. Press R2 plus X to start."),
    ("fixed", "deactivated", "Saqr deactivated."),
    (
        "fixed",
        "no_camera",
        "Camera not connected. Please plug in the camera and try again.",
    ),
    ("fixed", "safe", "Safe to enter. Have a good day."),
    (
        "fixed",
        "unsafe_generic",
        "Please stop. Wear your proper safety equipment.",
    ),
    # --- category "unsafe_missing" ---
    (
        "unsafe_missing",
        "helmet",
        "Please stop. Wear your proper safety equipment. "
        "You are missing helmet.",
    ),
    (
        "unsafe_missing",
        "vest",
        "Please stop. Wear your proper safety equipment. "
        "You are missing vest.",
    ),
    (
        "unsafe_missing",
        "helmet_vest",
        "Please stop. Wear your proper safety equipment. "
        "You are missing helmet and vest.",
    ),
]


def find_voice_files(
    voice_name: str,
    override_dir: Path | str | None = None,
) -> tuple[Path, Path] | tuple[None, None]:
    """Locate ``<voice_name>.onnx`` and ``<voice_name>.onnx.json``.

    Directories are tried in this order and the first one that contains
    BOTH files wins:

    1. ``override_dir`` (if given; a leading ``~`` is expanded)
    2. ``~/.local/share/piper-voices``
    3. ``~/.cache/piper/voices``
    4. the current working directory

    Args:
        voice_name: Voice name without extension, e.g. ``en_US-amy-medium``.
        override_dir: Optional extra directory searched before the defaults.

    Returns:
        ``(onnx_path, json_path)`` when found, otherwise ``(None, None)``.
    """
    search_dirs = []
    if override_dir is not None:
        # The value may arrive unexpanded (e.g. `--voice-dir=~/voices`, which
        # the shell does not tilde-expand), so expand it here.
        search_dirs.append(Path(override_dir).expanduser())
    search_dirs += [
        Path.home() / ".local" / "share" / "piper-voices",
        Path.home() / ".cache" / "piper" / "voices",
        Path.cwd(),
    ]
    for directory in search_dirs:
        onnx = directory / f"{voice_name}.onnx"
        config = directory / f"{voice_name}.onnx.json"
        # is_file() rather than exists(): a directory that merely has the
        # right name is not a usable model/config.
        if onnx.is_file() and config.is_file():
            return onnx, config
    return None, None


def resample_to_16k_mono_int16(pcm: bytes, src_rate: int, src_channels: int) -> bytes:
    """Linear-resample int16 PCM to 16 kHz mono.

    Args:
        pcm: Raw interleaved little-endian int16 samples (WAV frame data).
        src_rate: Sample rate of ``pcm`` in Hz; must be positive.
        src_channels: Number of interleaved channels in ``pcm``. Values
            greater than 1 are mixed down by averaging the channels.

    Returns:
        Little-endian int16 mono PCM at 16 kHz. Input that is already
        16 kHz is returned without resampling (after any mixdown).

    Raises:
        ValueError: If ``src_rate`` is not positive, or (from numpy) if the
            byte length does not fit whole int16 frames.

    Note:
        This is plain linear interpolation with no low-pass filter, so
        downsampling can alias content above 8 kHz.
    """
    import numpy as np

    if src_rate <= 0:
        raise ValueError(f"src_rate must be positive, got {src_rate}")

    # WAV PCM is little-endian regardless of the host CPU, so pin the byte
    # order instead of relying on the native int16 layout.
    int16_le = np.dtype("<i2")
    samples = np.frombuffer(pcm, dtype=int16_le)
    if src_channels > 1:
        samples = samples.reshape(-1, src_channels).mean(axis=1).astype(int16_le)
    # Nothing to resample; np.interp also rejects empty sample points.
    if src_rate == 16000 or samples.size == 0:
        return samples.tobytes()

    target_len = int(round(samples.size * 16000 / src_rate))
    # Fractional source positions for every output sample, endpoints included.
    positions = np.linspace(0, samples.size - 1, target_len)
    # Interpolate between neighbouring samples rather than truncating the
    # position to an index (which would just repeat/drop samples).
    interpolated = np.interp(positions, np.arange(samples.size), samples)
    return np.rint(interpolated).astype(int16_le).tobytes()


def save_wav_16k_mono(pcm: bytes, path: Path) -> None:
    """Write raw PCM frames to ``path`` as a 16 kHz, mono, 16-bit WAV file.

    Missing parent directories are created first. ``pcm`` is written as-is;
    this function does not convert or validate the sample data.
    """
    channels, sample_width_bytes, frame_rate = 1, 2, 16000

    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with wave.open(str(path), "wb") as writer:
        # nframes=0 here is only a placeholder; the wave module fills in the
        # real frame count from the data that is written.
        writer.setparams(
            (channels, sample_width_bytes, frame_rate, 0, "NONE", "not compressed")
        )
        writer.writeframes(pcm)


def _die(*lines):
    """Print each line to stderr and exit the process with status 1."""
    for line in lines:
        print(line, file=sys.stderr)
    sys.exit(1)


def _parse_only(raw):
    """Turn the --only value into a set of phrase keys, or None when unset.

    Whitespace around each key is ignored and empty items are dropped, so
    ``"safe, helmet"`` and ``"safe,helmet,"`` both yield ``{"safe", "helmet"}``.
    """
    if not raw:
        return None
    return {item.strip() for item in raw.split(",") if item.strip()}


def _synthesize_pcm(voice, text, length_scale=None):
    """Synthesize ``text`` with ``voice``; return (pcm_bytes, rate, channels).

    Audio is rendered into an in-memory WAV so the sample rate and channel
    count can be read back from the header the voice wrote.
    """
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        if hasattr(voice, "synthesize_wav"):
            # NOTE(review): per the piper-tts Python API docs, releases that
            # ship `piper.download_voices` write WAV output through
            # synthesize_wav() and take the speaking rate via
            # SynthesisConfig - confirm against the installed version.
            syn_config = None
            if length_scale is not None:
                from piper import SynthesisConfig
                syn_config = SynthesisConfig(length_scale=length_scale)
            voice.synthesize_wav(text, wf, syn_config=syn_config)
        else:
            # Voices without synthesize_wav(): keep the original call, where
            # synthesize() writes straight into the wave file object.
            kwargs = {} if length_scale is None else {"length_scale": length_scale}
            voice.synthesize(text, wf, **kwargs)
    buf.seek(0)
    with wave.open(buf, "rb") as wf:
        return wf.readframes(wf.getnframes()), wf.getframerate(), wf.getnchannels()


def main():
    """Command-line entry point: synthesize the selected phrases to WAV.

    Parses the CLI arguments, loads the requested piper voice, and for every
    selected entry of PHRASES writes a 16 kHz mono int16 WAV to
    ASSETS_ROOT/<category>/<key>.wav. Exits with status 1 (message on
    stderr) when no phrase matches --only, piper-tts cannot be imported, or
    the voice files cannot be found.
    """
    ap = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--voice",      default="en_US-amy-medium",
                    help="piper voice name (default en_US-amy-medium)")
    ap.add_argument("--voice-dir",  default=None,
                    help="override directory to search for the voice .onnx files")
    ap.add_argument("--only",       default=None,
                    help="comma-separated keys to generate (e.g. safe,helmet)")
    ap.add_argument("--length-scale", type=float, default=None,
                    help="speaking rate (piper default ~1.0; 0.8 faster, 1.2 slower)")
    args = ap.parse_args()

    only = _parse_only(args.only)
    targets = [(c, k, t) for (c, k, t) in PHRASES if only is None or k in only]
    if not targets:
        _die(f"No phrases match --only={args.only!r}.")
    if only is not None:
        # A typo in --only would otherwise silently generate fewer files.
        unknown = sorted(only - {k for (_c, k, _t) in targets})
        if unknown:
            print(f"WARNING: ignoring unknown --only keys: {', '.join(unknown)}",
                  file=sys.stderr)

    # Imported lazily so --help and the --only check work without piper-tts.
    try:
        from piper.voice import PiperVoice
    except ImportError:
        _die(
            "ERROR: piper-tts is not installed in this Python environment.",
            "",
            "Install it:",
            "  pip install piper-tts",
            f"  python -m piper.download_voices {args.voice}",
        )

    onnx, js = find_voice_files(args.voice, args.voice_dir)
    if onnx is None:
        _die(
            f"ERROR: voice files for {args.voice!r} not found.",
            "",
            "Download:",
            f"  python -m piper.download_voices {args.voice}",
            "",
            "Or pass --voice-dir pointing at a directory that contains",
            f"  {args.voice}.onnx + {args.voice}.onnx.json",
        )

    print(f"Loading voice: {onnx}")
    voice = PiperVoice.load(str(onnx), config_path=str(js))
    src_rate = int(voice.config.sample_rate)
    print(f"Native rate: {src_rate} Hz")
    print(f"Target:      16000 Hz mono int16 under {ASSETS_ROOT}")
    print()

    for cat, key, text in targets:
        out = ASSETS_ROOT / cat / f"{key}.wav"
        print(f"[{cat}/{key}]")
        print(f"   text : {text}")

        # Use the rate/channels from the WAV header the voice actually wrote
        # rather than assuming they match the voice config.
        pcm, rate_in, channels = _synthesize_pcm(voice, text, args.length_scale)
        resampled = resample_to_16k_mono_int16(pcm, rate_in, channels)
        save_wav_16k_mono(resampled, out)

        # 2 bytes per int16 sample, 16000 samples per second.
        dur = len(resampled) / 2.0 / 16000
        print(f"   saved → {out.relative_to(PROJECT_ROOT)}   ({dur:.1f}s)")
        print()

    print("Done. Deploy + restart:")
    print("   scripts/deploy.sh")
    print("   ssh unitree@192.168.123.164 'sudo systemctl restart saqr-bridge && "
          "journalctl -u saqr-bridge -n 20 | grep audio_player'")

if __name__ == "__main__":
    main()