Saqr/scripts/generate_phrases.py

179 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""
generate_phrases.py — synthesize Saqr's 8 phrases to WAV using piper-tts.
This produces the audio library locally (no G1 mic, no TtsMaker capture,
no PulseAudio) at exactly the format robot.audio_player expects:
16 kHz mono int16 WAV under assets/audio/<category>/<key>.wav.
Setup (once):
pip install piper-tts
python -m piper.download_voices en_US-amy-medium
Usage:
python scripts/generate_phrases.py
python scripts/generate_phrases.py --voice en_US-lessac-medium
python scripts/generate_phrases.py --only safe,helmet_vest
python scripts/generate_phrases.py --voice-dir /custom/path
Common voices (run the download command above, swap the name):
en_US-amy-medium female, neutral (~60 MB)
en_US-lessac-medium female, friendly (~60 MB)
en_US-ryan-high male, clear (~120 MB)
en_GB-alan-medium male, British (~60 MB)
"""
from __future__ import annotations
import argparse
import io
import sys
import wave
from pathlib import Path
# Directory holding this script; the project root is its parent directory.
_SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = _SCRIPT_DIR.parent

# Root of the generated audio library: assets/audio/<category>/<key>.wav
ASSETS_ROOT = PROJECT_ROOT.joinpath("assets", "audio")

# Phrase table: one (category, key, text) tuple per clip. main() synthesizes
# `text` and writes it to ASSETS_ROOT / category / f"{key}.wav".
PHRASES = [
    # --- category "fixed" ---------------------------------------------------
    (
        "fixed",
        "ready",
        "Saqr is running. Press R2 plus X to start.",
    ),
    (
        "fixed",
        "deactivated",
        "Saqr deactivated.",
    ),
    (
        "fixed",
        "no_camera",
        "Camera not connected. Please plug in the camera and try again.",
    ),
    (
        "fixed",
        "safe",
        "Safe to enter. Have a good day.",
    ),
    (
        "fixed",
        "unsafe_generic",
        "Please stop. Wear your proper safety equipment.",
    ),
    # --- category "unsafe_missing" -------------------------------------------
    (
        "unsafe_missing",
        "helmet",
        "Please stop. Wear your proper safety equipment. You are missing helmet.",
    ),
    (
        "unsafe_missing",
        "vest",
        "Please stop. Wear your proper safety equipment. You are missing vest.",
    ),
    (
        "unsafe_missing",
        "helmet_vest",
        "Please stop. Wear your proper safety equipment. You are missing helmet and vest.",
    ),
]
def find_voice_files(
    voice_name: str,
    override_dir: str | Path | None = None,
) -> tuple[Path, Path] | tuple[None, None]:
    """Locate the model and config files for a piper voice.

    A voice is considered found only when both ``<voice_name>.onnx`` and
    ``<voice_name>.onnx.json`` are regular files in the same directory.

    Directories are searched in this order, first match wins:

    1. ``override_dir`` (if given; a leading ``~`` is expanded),
    2. ``~/.local/share/piper-voices``,
    3. ``~/.cache/piper/voices``,
    4. the current working directory.

    Args:
        voice_name: Voice name without extension, e.g. ``en_US-amy-medium``.
        override_dir: Optional extra directory searched before the defaults.
            Accepts a string (as delivered by argparse) or a ``Path``.

    Returns:
        ``(onnx_path, config_path)`` for the first matching directory, or
        ``(None, None)`` when no directory holds both files.
    """
    search_dirs = []
    if override_dir is not None:
        # `--voice-dir=~/voices` reaches us unexpanded (the shell only expands
        # a tilde at the start of a word), so expand it here.
        search_dirs.append(Path(override_dir).expanduser())
    search_dirs += [
        Path.home() / ".local" / "share" / "piper-voices",
        Path.home() / ".cache" / "piper" / "voices",
        Path.cwd(),
    ]

    for directory in search_dirs:
        onnx = directory / f"{voice_name}.onnx"
        config = directory / f"{voice_name}.onnx.json"
        # is_file() rather than exists(): a directory with a matching name is
        # not a usable voice.
        if onnx.is_file() and config.is_file():
            return onnx, config
    return None, None
def resample_to_16k_mono_int16(pcm: bytes, src_rate: int, src_channels: int) -> bytes:
    """Convert interleaved int16 PCM to 16 kHz mono int16 using linear interpolation.

    Multi-channel input is down-mixed by averaging the channels of each frame.
    If the source rate differs from 16 kHz, output samples are linearly
    interpolated between neighbouring source samples (the first and last
    source samples map to the first and last output samples). Values are
    rounded and clipped to the int16 range once, at the very end.

    Args:
        pcm: Raw little-endian-native int16 samples, interleaved by channel.
        src_rate: Sample rate of ``pcm`` in Hz.
        src_channels: Number of interleaved channels in ``pcm``.

    Returns:
        Raw int16 mono PCM bytes at 16 kHz. Empty input yields ``b""``.

    Raises:
        ValueError: If the sample count is not a multiple of ``src_channels``
            (raised by the NumPy reshape).
    """
    import numpy as np

    target_rate = 16000
    samples = np.frombuffer(pcm, dtype=np.int16)

    if src_channels > 1:
        # Average in float64 and defer quantisation, so down-mixing and
        # resampling together introduce only one rounding step.
        samples = samples.reshape(-1, src_channels).mean(axis=1)

    if src_rate != target_rate and samples.size > 0:
        target_len = int(round(samples.size * target_rate / src_rate))
        # Fractional source positions for every output sample.
        positions = np.linspace(0, samples.size - 1, target_len)
        # True linear interpolation; indexing with floored positions would
        # merely repeat/drop source samples. NOTE: no low-pass filter is
        # applied, so content above 8 kHz can still alias when downsampling.
        samples = np.interp(positions, np.arange(samples.size), samples)

    if samples.dtype != np.int16:
        samples = np.clip(np.rint(samples), -32768, 32767).astype(np.int16)
    return samples.tobytes()
def save_wav_16k_mono(pcm: bytes, path: Path) -> None:
    """Write raw PCM bytes to ``path`` as a 16 kHz, mono, 16-bit WAV file.

    Missing parent directories are created. An existing file at ``path`` is
    overwritten. ``pcm`` is written as-is; it is not converted or validated.
    """
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)

    writer = wave.open(str(path), "wb")
    with writer:
        # (nchannels, sampwidth, framerate, nframes, comptype, compname);
        # nframes=0 lets the wave module derive the count from the data.
        writer.setparams((1, 2, 16000, 0, "NONE", "not compressed"))
        writer.writeframes(pcm)
def _fail(*lines: str) -> None:
    """Print each of *lines* to stderr, then exit the script with status 1."""
    for line in lines:
        print(line, file=sys.stderr)
    sys.exit(1)


def _parse_only_keys(raw):
    """Turn a comma-separated ``--only`` value into a set of keys (None = no filter)."""
    if not raw:
        return None
    # Strip whitespace so `--only "safe, helmet"` selects both keys.
    return {part.strip() for part in raw.split(",") if part.strip()}


def _synthesize_pcm(voice, text, length_scale=None):
    """Synthesize *text* with *voice*; return ``(pcm_bytes, sample_rate, channels)``.

    Audio is rendered into an in-memory WAV and read back, so the rate and
    channel count come from what piper actually wrote.

    Two piper-tts Python APIs are supported (feature-detected):

    * newer releases (the ones that ship ``piper.download_voices``) write WAV
      data via ``voice.synthesize_wav(text, wav_file, syn_config=...)``; there
      ``voice.synthesize(text, ...)`` is a lazy chunk generator and must not
      be called with a wave file;
    * older releases write via ``voice.synthesize(text, wav_file, **kwargs)``.

    Raises:
        RuntimeError: If piper produced audio that is not 16-bit PCM, which
            the int16 resampler downstream cannot interpret.
    """
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        if hasattr(voice, "synthesize_wav"):
            syn_config = None
            if length_scale is not None:
                from piper import SynthesisConfig

                syn_config = SynthesisConfig(length_scale=length_scale)
            voice.synthesize_wav(text, wf, syn_config=syn_config)
        else:
            legacy_kwargs = {}
            if length_scale is not None:
                legacy_kwargs["length_scale"] = length_scale
            voice.synthesize(text, wf, **legacy_kwargs)

    buf.seek(0)
    with wave.open(buf, "rb") as wf:
        if wf.getsampwidth() != 2:
            raise RuntimeError(
                f"expected 16-bit PCM from piper, got sample width {wf.getsampwidth()} bytes"
            )
        pcm = wf.readframes(wf.getnframes())
        return pcm, wf.getframerate(), wf.getnchannels()


def main():
    """Command-line entry point: synthesize the selected PHRASES to WAV files.

    Parses the CLI options, loads the requested piper voice, and for every
    selected phrase writes a 16 kHz mono int16 WAV to
    ``ASSETS_ROOT/<category>/<key>.wav``. Progress goes to stdout; errors and
    warnings go to stderr. Exits with status 1 when no phrase matches
    ``--only``, when piper-tts cannot be imported, or when the voice files
    cannot be found.
    """
    ap = argparse.ArgumentParser(
        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--voice", default="en_US-amy-medium",
                    help="piper voice name (default en_US-amy-medium)")
    ap.add_argument("--voice-dir", default=None,
                    help="override directory to search for the voice .onnx files")
    ap.add_argument("--only", default=None,
                    help="comma-separated keys to generate (e.g. safe,helmet)")
    ap.add_argument("--length-scale", type=float, default=None,
                    help="speaking rate (piper default ~1.0; 0.8 faster, 1.2 slower)")
    args = ap.parse_args()

    only = _parse_only_keys(args.only)
    targets = [(c, k, t) for (c, k, t) in PHRASES if only is None or k in only]
    if not targets:
        _fail(f"No phrases match --only={args.only!r}.")
    if only is not None:
        # A typo in one key should not silently shrink the generated set.
        unknown = sorted(only - {k for _, k, _ in PHRASES})
        if unknown:
            print(f"WARNING: ignoring unknown --only key(s): {', '.join(unknown)}",
                  file=sys.stderr)

    # Imported lazily so --help and argument errors work without piper-tts.
    try:
        from piper.voice import PiperVoice
    except ImportError:
        _fail(
            "ERROR: piper-tts is not installed in this Python environment.",
            "",
            "Install it:",
            " pip install piper-tts",
            f" python -m piper.download_voices {args.voice}",
        )

    onnx, js = find_voice_files(args.voice, args.voice_dir)
    if onnx is None:
        _fail(
            f"ERROR: voice files for {args.voice!r} not found.",
            "",
            "Download:",
            f" python -m piper.download_voices {args.voice}",
            "",
            "Or pass --voice-dir pointing at a directory that contains",
            f" {args.voice}.onnx + {args.voice}.onnx.json",
        )

    print(f"Loading voice: {onnx}")
    voice = PiperVoice.load(str(onnx), config_path=str(js))
    src_rate = int(voice.config.sample_rate)
    print(f"Native rate: {src_rate} Hz")
    print(f"Target: 16000 Hz mono int16 under {ASSETS_ROOT}")
    print()

    for cat, key, text in targets:
        out = ASSETS_ROOT / cat / f"{key}.wav"
        print(f"[{cat}/{key}]")
        print(f" text : {text}")
        pcm, rate_in, channels = _synthesize_pcm(voice, text, args.length_scale)
        resampled = resample_to_16k_mono_int16(pcm, rate_in, channels)
        save_wav_16k_mono(resampled, out)
        # 2 bytes per int16 sample at 16 000 samples per second.
        dur = len(resampled) / 2.0 / 16000
        print(f" saved → {out.relative_to(PROJECT_ROOT)} ({dur:.1f}s)")
        print()

    print("Done. Deploy + restart:")
    print(" scripts/deploy.sh")
    print(" ssh unitree@192.168.123.164 'sudo systemctl restart saqr-bridge && "
          "journalctl -u saqr-bridge -n 20 | grep audio_player'")
# Run the generator only when this file is executed as a script; importing
# the module must not trigger synthesis.
if __name__ == "__main__":
    main()