#!/usr/bin/env python3
"""
generate_phrases.py — synthesize Saqr's 8 phrases to WAV using piper-tts.

This produces the audio library locally (no G1 mic, no TtsMaker capture, no
PulseAudio) at exactly the format robot.audio_player expects: 16 kHz mono
int16 WAV under assets/audio/<category>/<key>.wav.

Setup (once):
    pip install piper-tts
    python -m piper.download_voices en_US-amy-medium

Usage:
    python scripts/generate_phrases.py
    python scripts/generate_phrases.py --voice en_US-lessac-medium
    python scripts/generate_phrases.py --only safe,helmet_vest
    python scripts/generate_phrases.py --voice-dir /custom/path

Common voices (run the download command above, swap the name):
    en_US-amy-medium     female, neutral   (~60 MB)
    en_US-lessac-medium  female, friendly  (~60 MB)
    en_US-ryan-high      male, clear       (~120 MB)
    en_GB-alan-medium    male, British     (~60 MB)
"""

from __future__ import annotations

import argparse
import io
import sys
import wave
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent.parent
ASSETS_ROOT = PROJECT_ROOT / "assets" / "audio"

# Output sample rate of every generated WAV (mono, 16-bit).
TARGET_RATE_HZ = 16000

# (category, key, text) — written to ASSETS_ROOT / category / f"{key}.wav".
PHRASES = [
    ("fixed", "ready", "Saqr is running. Press R2 plus X to start."),
    ("fixed", "deactivated", "Saqr deactivated."),
    ("fixed", "no_camera",
     "Camera not connected. Please plug in the camera and try again."),
    ("fixed", "safe", "Safe to enter. Have a good day."),
    ("fixed", "unsafe_generic",
     "Please stop. Wear your proper safety equipment."),
    ("unsafe_missing", "helmet",
     "Please stop. Wear your proper safety equipment. You are missing helmet."),
    ("unsafe_missing", "vest",
     "Please stop. Wear your proper safety equipment. You are missing vest."),
    ("unsafe_missing", "helmet_vest",
     "Please stop. Wear your proper safety equipment. "
     "You are missing helmet and vest."),
]


def find_voice_files(voice_name: str, override_dir: str | Path | None = None):
    """Locate ``<voice_name>.onnx`` and ``<voice_name>.onnx.json``.

    Directories are tried in order: *override_dir* (when given),
    ``~/.local/share/piper-voices``, ``~/.cache/piper/voices`` and the
    current working directory. The first directory holding both files wins.

    Returns:
        ``(onnx_path, json_path)``, or ``(None, None)`` when no directory
        contains both files.
    """
    search_dirs = []
    if override_dir is not None:
        search_dirs.append(Path(override_dir))
    search_dirs += [
        Path.home() / ".local" / "share" / "piper-voices",
        Path.home() / ".cache" / "piper" / "voices",
        Path.cwd(),
    ]
    for directory in search_dirs:
        onnx = directory / f"{voice_name}.onnx"
        config = directory / f"{voice_name}.onnx.json"
        if onnx.exists() and config.exists():
            return onnx, config
    return None, None


def resample_to_16k_mono_int16(pcm: bytes, src_rate: int, src_channels: int) -> bytes:
    """Linear-resample interleaved int16 PCM to 16 kHz mono int16.

    Args:
        pcm: Raw int16 samples (native byte order), interleaved when
            *src_channels* > 1.
        src_rate: Sample rate of *pcm* in Hz.
        src_channels: Number of interleaved channels in *pcm*.

    Returns:
        Raw int16 mono PCM at 16 kHz. Input that is already 16 kHz is only
        down-mixed, never resampled.
    """
    import numpy as np

    samples = np.frombuffer(pcm, dtype=np.int16)
    if src_channels > 1:
        # Drop a trailing partial frame so the reshape cannot fail.
        usable = len(samples) - len(samples) % src_channels
        samples = samples[:usable].reshape(-1, src_channels).mean(axis=1).astype(np.int16)
    if src_rate == TARGET_RATE_HZ:
        return samples.tobytes()
    if len(samples) == 0:
        return b""

    target_len = int(round(len(samples) * TARGET_RATE_HZ / src_rate))
    # Fractional source positions for each output sample; endpoints map to
    # endpoints. Interpolating (rather than truncating the positions to
    # integer indices) is what makes this an actual linear resample.
    positions = np.linspace(0, len(samples) - 1, target_len)
    interpolated = np.interp(
        positions, np.arange(len(samples)), samples.astype(np.float64)
    )
    # NOTE: no anti-alias low-pass is applied before downsampling.
    return np.rint(interpolated).astype(np.int16).tobytes()


def save_wav_16k_mono(pcm: bytes, path: Path) -> None:
    """Write *pcm* (int16 mono, 16 kHz) to *path*, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with wave.open(str(path), "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(TARGET_RATE_HZ)
        wf.writeframes(pcm)


def _select_phrases(only_arg: str | None):
    """Return ``(selected_phrases, unknown_keys)`` for an ``--only`` value.

    Keys are comma-separated; surrounding whitespace and empty items are
    ignored so ``"safe, helmet"`` selects both phrases. A missing or empty
    *only_arg* selects every phrase.
    """
    if not only_arg:
        return list(PHRASES), set()
    wanted = {key.strip() for key in only_arg.split(",") if key.strip()}
    known = {key for _, key, _ in PHRASES}
    selected = [phrase for phrase in PHRASES if phrase[1] in wanted]
    return selected, wanted - known


def _synthesize_to_wav(voice, text: str, wav_file, length_scale: float | None) -> None:
    """Synthesize *text* into the open ``wave`` writer *wav_file*.

    NOTE(review): piper-tts releases that ship ``piper.download_voices`` are
    understood to expose ``PiperVoice.synthesize_wav(text, wav_file,
    syn_config=...)`` with ``piper.SynthesisConfig``; older releases use
    ``synthesize(text, wav_file, length_scale=...)``. Confirm against the
    installed piper version.
    """
    if hasattr(voice, "synthesize_wav"):
        syn_config = None
        if length_scale is not None:
            from piper import SynthesisConfig

            syn_config = SynthesisConfig(length_scale=length_scale)
        voice.synthesize_wav(text, wav_file, syn_config=syn_config)
        return

    # Legacy API: identical to the call this script originally made.
    synth_kwargs = {}
    if length_scale is not None:
        synth_kwargs["length_scale"] = length_scale
    voice.synthesize(text, wav_file, **synth_kwargs)


def main():
    """Parse CLI options, synthesize the selected phrases, write the WAVs.

    Exits with status 1 when no phrase matches ``--only``, when piper-tts is
    not importable, or when the voice files cannot be found.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--voice", default="en_US-amy-medium",
                    help="piper voice name (default en_US-amy-medium)")
    ap.add_argument("--voice-dir", default=None,
                    help="override directory to search for the voice .onnx files")
    ap.add_argument("--only", default=None,
                    help="comma-separated keys to generate (e.g. safe,helmet)")
    ap.add_argument("--length-scale", type=float, default=None,
                    help="speaking rate (piper default ~1.0; 0.8 faster, 1.2 slower)")
    args = ap.parse_args()

    targets, unknown = _select_phrases(args.only)
    if unknown:
        print(f"WARNING: unknown --only key(s) ignored: {', '.join(sorted(unknown))}")
    if not targets:
        print(f"No phrases match --only={args.only!r}.")
        sys.exit(1)

    # Import piper lazily so --help and argument errors work without it.
    try:
        from piper.voice import PiperVoice
    except ImportError:
        print("ERROR: piper-tts is not installed in this Python environment.")
        print()
        print("Install it:")
        print(" pip install piper-tts")
        print(f" python -m piper.download_voices {args.voice}")
        sys.exit(1)

    onnx, js = find_voice_files(args.voice, args.voice_dir)
    if onnx is None:
        print(f"ERROR: voice files for {args.voice!r} not found.")
        print()
        print("Download:")
        print(f" python -m piper.download_voices {args.voice}")
        print()
        print("Or pass --voice-dir pointing at a directory that contains")
        print(f" {args.voice}.onnx + {args.voice}.onnx.json")
        sys.exit(1)

    print(f"Loading voice: {onnx}")
    voice = PiperVoice.load(str(onnx), config_path=str(js))
    src_rate = int(voice.config.sample_rate)
    print(f"Native rate: {src_rate} Hz")
    print(f"Target: {TARGET_RATE_HZ} Hz mono int16 under {ASSETS_ROOT}")
    print()

    for cat, key, text in targets:
        out = ASSETS_ROOT / cat / f"{key}.wav"
        print(f"[{cat}/{key}]")
        print(f" text : {text}")

        # Synthesize into an in-memory WAV buffer, then read back the raw
        # frames together with the format piper actually wrote.
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            _synthesize_to_wav(voice, text, wf, args.length_scale)
        buf.seek(0)
        with wave.open(buf, "rb") as wf:
            pcm = wf.readframes(wf.getnframes())
            rate_in = wf.getframerate()
            channels = wf.getnchannels()

        resampled = resample_to_16k_mono_int16(pcm, rate_in, channels)
        save_wav_16k_mono(resampled, out)
        # 2 bytes per int16 sample.
        dur = len(resampled) / 2.0 / TARGET_RATE_HZ
        print(f" saved → {out.relative_to(PROJECT_ROOT)} ({dur:.1f}s)")

    print()
    print("Done. Deploy + restart:")
    print(" scripts/deploy.sh")
    print(" ssh unitree@192.168.123.164 'sudo systemctl restart saqr-bridge && "
          "journalctl -u saqr-bridge -n 20 | grep audio_player'")


if __name__ == "__main__":
    main()