Marcus/Voice/builtin_tts.py

89 lines
3.0 KiB
Python

"""
builtin_tts.py — Unitree G1 built-in TTS (English only)
========================================================
Thin wrapper around AudioClient.TtsMaker(text, speaker_id). The G1's on-board
TTS engine synthesizes and plays directly through the body speaker — no
internet, no MP3/WAV roundtrip, no audio SDK plumbing on our side.
Supported languages (firmware-side):
English — works (Marcus uses this)
Chinese — works (unused)
Arabic — silently falls back to Chinese (unusable — speak() rejects all non-ASCII text)
Signature:
client.TtsMaker(text: str, speaker_id: int) -> int # 0 = success
speaker_id ∈ {0, 1, 2} — different voice timbres
Usage:
from Voice.builtin_tts import BuiltinTTS
tts = BuiltinTTS(audio_client)
tts.speak("Hello, I am Sanad", speaker_id=0)
"""
from __future__ import annotations
import logging
import time
from typing import Optional
# Module-level logger. The name is the fixed string "builtin_tts" (not
# __name__), so logging configuration has to address it by that name.
log = logging.getLogger("builtin_tts")
class BuiltinTTS:
    """Synchronous English-only TTS via the G1's on-board engine.

    Wraps ``audio_client.TtsMaker(text, speaker_id)``. Because that call is
    fire-and-forget, ``speak()`` can optionally sleep for an estimated
    playback duration so the caller knows when the robot has stopped talking.
    """

    # Rough playback duration per character — enough margin that `speak()`
    # returns after audio has actually finished on the robot.
    SECONDS_PER_CHAR = 0.08
    # Floor for the blocking wait, applied however short the text is.
    MIN_SECONDS = 1.5

    def __init__(self, audio_client, default_speaker_id: int = 0):
        """Store the audio client and the default voice.

        Args:
            audio_client: initialized unitree_sdk2py AudioClient; only its
                ``TtsMaker(text, speaker_id)`` method is used here.
            default_speaker_id: 0, 1, or 2 (default voice timbre), used when
                ``speak()`` is called without an explicit ``speaker_id``.
        """
        self._client = audio_client
        self._default_speaker = default_speaker_id

    def speak(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        block: bool = True,
    ) -> int:
        """Play `text` on the G1 speaker via TtsMaker.

        English-only by policy. Non-ASCII (e.g. Arabic) input is rejected
        rather than silently played back as Chinese.

        Args:
            text: text to synthesize; empty or whitespace-only text is
                rejected.
            speaker_id: voice timbre to use; ``None`` selects the default
                given to the constructor (an explicit ``0`` is honoured).
            block: when True and TtsMaker reports success, sleep for the
                estimated playback duration before returning.

        Returns:
            The TtsMaker status code (0 = success), or -1 if the input was
            rejected or the TtsMaker call raised.
        """
        if not text or not text.strip():
            return -1

        # Reject non-English. TtsMaker "falls back" by playing Arabic text
        # as Chinese phonemes — intelligible to nobody — so we refuse it
        # rather than surprise the operator.
        if not text.isascii():
            log.warning("builtin_tts refusing non-ASCII text: %r", text[:60])
            return -1

        sid = self._default_speaker if speaker_id is None else speaker_id
        log.info("[TtsMaker sid=%d] %s", sid, text[:80])
        try:
            code = self._client.TtsMaker(text, sid)
        except Exception as e:
            # Best-effort boundary around the SDK call: report the failure
            # through the return value instead of propagating.
            log.error("TtsMaker call failed: %s", e)
            return -1

        if code != 0:
            # Non-zero status means the request was not accepted, so there
            # is no playback to wait for — return without stalling the caller.
            log.warning(
                "TtsMaker returned non-zero status %s; skipping playback wait",
                code,
            )
            return code

        if block:
            # Estimate how long the G1 is going to take to finish speaking.
            # TtsMaker is fire-and-forget — we need to wait so the mic loop
            # knows when to unmute.
            duration = max(self.MIN_SECONDS, len(text) * self.SECONDS_PER_CHAR)
            time.sleep(duration)
        return code