121 lines
4.4 KiB
Python
121 lines
4.4 KiB
Python
"""
|
|
builtin_tts.py — Unitree G1 built-in TTS (English)
|
|
===================================================
|
|
Thin wrapper around AudioClient.TtsMaker(text, speaker_id). The G1's on-board
|
|
TTS engine synthesizes and plays directly through the body speaker — no
|
|
internet, no MP3/WAV roundtrip, no audio SDK plumbing on our side.
|
|
|
|
Speaker IDs — IMPORTANT:
|
|
The `speaker_id` argument is *not* a language selector; it's a voice
|
|
bank. On some G1 firmware revisions, speaker_id 0 produces a Mandarin
|
|
voice even for Latin-script input. If `Config/config_Voice.json::
|
|
tts.builtin_speaker_id` gives you Chinese output, try 1 or 2.
|
|
|
|
Quick probe on the robot:
|
|
python3 /home/unitree/Marcus/Voice/builtin_tts.py 0 1 2
|
|
(plays "Hello, I am Sanad" once per speaker_id so you can pick the
|
|
one that sounds like English, then set that in config_Voice.json).
|
|
|
|
Usage:
|
|
from Voice.builtin_tts import BuiltinTTS
|
|
tts = BuiltinTTS(audio_client, default_speaker_id=1)
|
|
tts.speak("Hello, I am Sanad")
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from typing import Optional
|
|
|
|
log = logging.getLogger("builtin_tts")
|
|
|
|
|
|
class BuiltinTTS:
    """Synchronous English-only TTS via the G1's on-board engine.

    Wraps ``audio_client.TtsMaker(text, speaker_id)``. Because that call does
    not wait for playback, `speak()` can optionally sleep for an estimated
    playback time so the caller knows when the robot has stopped talking.
    """

    # Rough playback duration per character — enough margin that `speak()`
    # returns after audio has actually finished on the robot.
    SECONDS_PER_CHAR = 0.08
    # Floor for the blocking wait, applied when the per-character estimate
    # comes out shorter than this.
    MIN_SECONDS = 1.5

    def __init__(self, audio_client, default_speaker_id: int = 0):
        """
        Args:
            audio_client       : initialized unitree_sdk2py AudioClient
            default_speaker_id : 0, 1, or 2 (default voice timbre); used
                                 whenever `speak()` is called without an
                                 explicit `speaker_id`
        """
        self._client = audio_client
        self._default_speaker = default_speaker_id

    def speak(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        block: bool = True,
    ) -> int:
        """
        Play `text` on the G1 speaker via TtsMaker.

        English-only by policy. Non-ASCII (Arabic) input is rejected rather
        than silently played back as Chinese.

        Args:
            text       : phrase to speak; empty / whitespace-only text is
                         rejected
            speaker_id : voice bank for this call; None means use the
                         default given to the constructor
            block      : when True, sleep for the estimated playback time
                         before returning (only if TtsMaker reported success)

        Returns:
            The TtsMaker status code (0 = success), or -1 if the input was
            rejected or the TtsMaker call raised.
        """
        if not text or not text.strip():
            return -1

        # Reject non-English. TtsMaker "falls back" by playing Arabic text
        # as Chinese phonemes — intelligible to nobody — so we refuse it
        # rather than surprise the operator.
        if not text.isascii():
            log.warning("builtin_tts refusing non-ASCII text: %r", text[:60])
            return -1

        sid = self._default_speaker if speaker_id is None else speaker_id
        # %s rather than %d: a speaker id that arrives as a string (e.g. from
        # a JSON config) must not break the log record formatting.
        log.info("[TtsMaker sid=%s] %s", sid, text[:80])

        try:
            code = self._client.TtsMaker(text, sid)
        except Exception as e:
            # Boundary with the robot SDK: report (with traceback) and turn
            # the failure into the -1 "rejected" status instead of raising.
            log.exception("TtsMaker call failed: %s", e)
            return -1

        # TtsMaker is fire-and-forget — we need to wait so the mic loop
        # knows when to unmute. A non-zero status means the request was
        # reported as failed, so waiting out the estimated playback time
        # would only stall the caller.
        if block and not code:
            duration = max(self.MIN_SECONDS, len(text) * self.SECONDS_PER_CHAR)
            time.sleep(duration)

        return code
|
|
|
|
|
|
# ─── STANDALONE PROBE ────────────────────────────────────────────────
# Usage on the robot (in the marcus conda env):
#   python3 Voice/builtin_tts.py           # plays all 3 speaker IDs
#   python3 Voice/builtin_tts.py 1 2       # plays only IDs 1 and 2
# Pick whichever ID sounds like English and set it in
# Config/config_Voice.json :: tts.builtin_speaker_id
|
|
|
|
if __name__ == "__main__":
    # Standalone probe: speak one fixed phrase with each requested speaker_id
    # so the operator can hear which voice bank sounds like English.
    import sys

    from unitree_sdk2py.core.channel import ChannelFactoryInitialize
    from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient

    # Validate the command line before touching the robot, so a typo gives a
    # usage message instead of a traceback after the hardware is initialized.
    try:
        ids = [int(arg) for arg in sys.argv[1:]] or [0, 1, 2]
    except ValueError:
        sys.exit(
            f"usage: {sys.argv[0]} [speaker_id ...]  "
            "(speaker IDs must be integers, e.g. 0 1 2)"
        )

    # NOTE(review): presumably domain id 0 on network interface "eth0" —
    # confirm against the robot's network setup.
    ChannelFactoryInitialize(0, "eth0")
    ac = AudioClient()
    ac.SetTimeout(10.0)
    ac.Init()
    ac.SetVolume(100)

    phrase = "Hello, I am Sanad."
    print(f"\nProbing TtsMaker with text: {phrase!r}\n")
    tts = BuiltinTTS(ac)
    for sid in ids:
        print(f"  → speaker_id = {sid}")
        status = tts.speak(phrase, speaker_id=sid, block=True)
        if status != 0:
            # Without this, a failed request is indistinguishable from a
            # speaker_id that is merely silent.
            print(f"    ! speak() returned status {status} for speaker_id {sid}")
        # Short gap between voices so consecutive samples don't run together.
        time.sleep(0.5)
    print("\nDone. Pick the speaker_id that sounded like English and put it in")
    print("  Config/config_Voice.json :: tts.builtin_speaker_id")
|