# Marcus/Voice/builtin_tts.py
#
# 121 lines
# 4.4 KiB
# Python

"""
builtin_tts.py — Unitree G1 built-in TTS (English)
===================================================
Thin wrapper around AudioClient.TtsMaker(text, speaker_id). The G1's on-board
TTS engine synthesizes and plays directly through the body speaker — no
internet, no MP3/WAV roundtrip, no audio SDK plumbing on our side.
Speaker IDs — IMPORTANT:
The `speaker_id` argument is *not* a language selector; it's a voice
bank. On some G1 firmware revisions, speaker_id 0 produces a Mandarin
voice even for Latin-script input. If `Config/config_Voice.json::
tts.builtin_speaker_id` gives you Chinese output, try 1 or 2.
Quick probe on the robot:
python3 /home/unitree/Marcus/Voice/builtin_tts.py 0 1 2
(plays "Hello, I am Sanad" once per speaker_id so you can pick the
one that sounds like English, then set that in config_Voice.json).
Usage:
from Voice.builtin_tts import BuiltinTTS
tts = BuiltinTTS(audio_client, default_speaker_id=1)
tts.speak("Hello, I am Sanad")
"""
from __future__ import annotations
import logging
import time
from typing import Optional
log = logging.getLogger("builtin_tts")
class BuiltinTTS:
    """Blocking, English-only speech output through the G1's on-board TTS."""

    # Estimated playback time per input character, in seconds. Deliberately
    # generous so that a blocking `speak()` only returns once the robot
    # should have finished talking.
    SECONDS_PER_CHAR = 0.08
    # Floor for the blocking wait, in seconds, applied to short phrases.
    MIN_SECONDS = 1.5

    def __init__(self, audio_client, default_speaker_id: int = 0):
        """
        Remember the audio client and the fallback voice.

        Args:
            audio_client       : initialized unitree_sdk2py AudioClient
            default_speaker_id : 0, 1, or 2 — voice timbre used whenever
                                 `speak()` is called without `speaker_id`
        """
        self._client = audio_client
        self._default_speaker = default_speaker_id

    def speak(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        block: bool = True,
    ) -> int:
        """
        Say `text` through the G1 body speaker using TtsMaker.

        Only ASCII text is accepted: anything containing a code point above
        127 (e.g. Arabic) is refused instead of being handed to the engine,
        which would otherwise voice it as Chinese.

        Args:
            text       : phrase to speak
            speaker_id : voice to use; None selects the instance default
            block      : when True, sleep for the estimated playback time
                         before returning

        Returns:
            The status code from TtsMaker (0 = success), or -1 when the
            input was rejected or the TtsMaker call raised.
        """
        # Nothing to say: empty, None, or whitespace-only input.
        if not (text and text.strip()):
            return -1

        # English-only policy. TtsMaker would render non-Latin text with
        # Chinese phonemes, which nobody can understand, so refuse up front.
        # `text` is guaranteed non-empty here, so max() always has input.
        if max(map(ord, text)) > 127:
            log.warning("builtin_tts refusing non-ASCII text: %r", text[:60])
            return -1

        voice = speaker_id if speaker_id is not None else self._default_speaker
        log.info("[TtsMaker sid=%d] %s", voice, text[:80])

        try:
            status = self._client.TtsMaker(text, voice)
        except Exception as exc:
            log.error("TtsMaker call failed: %s", exc)
            return -1

        if not block:
            return status

        # TtsMaker returns immediately while the robot is still speaking.
        # Hold the caller for an estimated playback time so the mic loop
        # can tell when it is safe to unmute.
        wait_s = len(text) * self.SECONDS_PER_CHAR
        if wait_s < self.MIN_SECONDS:
            wait_s = self.MIN_SECONDS
        time.sleep(wait_s)
        return status
# ─── STANDALONE PROBE ────────────────────────────────────────────────
# Usage on the robot (in the marcus conda env):
# python3 Voice/builtin_tts.py # plays all 3 speaker IDs
# python3 Voice/builtin_tts.py 1 2 # plays only IDs 1 and 2
# Pick whichever ID sounds like English and set it in
# Config/config_Voice.json :: tts.builtin_speaker_id
if __name__ == "__main__":
    # Standalone probe: speak one fixed phrase with each requested speaker ID
    # so the operator can hear which voice bank sounds like English.
    import sys

    from unitree_sdk2py.core.channel import ChannelFactoryInitialize
    from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient

    # Parse the command line before touching the robot, so a typo fails fast
    # with a readable message instead of a traceback after the SDK is up.
    # With no arguments, probe speaker IDs 0, 1 and 2.
    try:
        ids = [int(arg) for arg in sys.argv[1:]] or [0, 1, 2]
    except ValueError:
        sys.exit(
            f"usage: {sys.argv[0]} [speaker_id ...]  "
            "(speaker IDs must be integers)"
        )

    # NOTE(review): presumably DDS domain 0 on the `eth0` interface —
    # confirm against the robot's network setup.
    ChannelFactoryInitialize(0, "eth0")
    ac = AudioClient()
    ac.SetTimeout(10.0)
    ac.Init()
    ac.SetVolume(100)

    phrase = "Hello, I am Sanad."
    print(f"\nProbing TtsMaker with text: {phrase!r}\n")
    tts = BuiltinTTS(ac)
    for sid in ids:
        print(f" → speaker_id = {sid}")
        code = tts.speak(phrase, speaker_id=sid, block=True)
        # speak() returns 0 on success. Report anything else so a failed
        # call is not mistaken for a voice bank that happens to be silent.
        if code != 0:
            print(f"   ! TtsMaker returned status {code} for speaker_id {sid}")
        # Short gap so consecutive voices are easy to tell apart.
        time.sleep(0.5)
    print("\nDone. Pick the speaker_id that sounded like English and put it in")
    print(" Config/config_Voice.json :: tts.builtin_speaker_id")