89 lines
3.0 KiB
Python
89 lines
3.0 KiB
Python
"""
|
|
builtin_tts.py — Unitree G1 built-in TTS (English only)
|
|
========================================================
|
|
Thin wrapper around AudioClient.TtsMaker(text, speaker_id). The G1's on-board
|
|
TTS engine synthesizes and plays directly through the body speaker — no
|
|
internet, no MP3/WAV roundtrip, no audio SDK plumbing on our side.
|
|
|
|
Supported languages (firmware-side):
|
|
English — works (Marcus uses this)
|
|
Chinese — works (unused)
|
|
Arabic — silently falls back to Chinese (unusable — we refuse these)
|
|
|
|
Signature:
|
|
client.TtsMaker(text: str, speaker_id: int) -> int # 0 = success
|
|
speaker_id ∈ {0, 1, 2} — different voice timbres
|
|
|
|
Usage:
|
|
from Voice.builtin_tts import BuiltinTTS
|
|
tts = BuiltinTTS(audio_client)
|
|
tts.speak("Hello, I am Sanad", speaker_id=0)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import time
|
|
from typing import Optional
|
|
|
|
log = logging.getLogger("builtin_tts")


class BuiltinTTS:
    """Synchronous English-only TTS via the G1's on-board engine.

    Wraps ``audio_client.TtsMaker(text, speaker_id)``: rejects empty or
    non-ASCII input before it reaches the client, and can block for an
    estimated playback time so the caller knows when speech has finished.
    """

    # Rough playback duration per character — enough margin that `speak()`
    # returns after audio has actually finished on the robot.
    SECONDS_PER_CHAR = 0.08
    # Lower bound on the blocking wait, applied to very short utterances.
    MIN_SECONDS = 1.5

    def __init__(self, audio_client, default_speaker_id: int = 0):
        """
        Args:
            audio_client       : initialized unitree_sdk2py AudioClient
            default_speaker_id : 0, 1, or 2 (default voice timbre)
        """
        self._client = audio_client
        self._default_speaker = default_speaker_id

    def _estimated_duration(self, text: str) -> float:
        """Return the estimated playback time of `text`, in seconds."""
        return max(self.MIN_SECONDS, len(text) * self.SECONDS_PER_CHAR)

    def speak(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        block: bool = True,
    ) -> int:
        """
        Play `text` on the G1 speaker via TtsMaker.

        English-only by policy. Non-ASCII (Arabic) input is rejected rather
        than silently played back as Chinese.

        Args:
            text       : text to speak; must be non-blank and pure ASCII
            speaker_id : voice timbre for this call; None uses the default
                         given to the constructor
            block      : if True and TtsMaker reports success, sleep for the
                         estimated playback duration before returning

        Returns:
            The TtsMaker status code (0 = success), or -1 if the input was
            rejected or the TtsMaker call raised an exception.
        """
        if not text or not text.strip():
            return -1

        # Reject non-English. TtsMaker "falls back" by playing Arabic text
        # as Chinese phonemes — intelligible to nobody — so we refuse it
        # rather than surprise the operator.
        if not text.isascii():
            log.warning("builtin_tts refusing non-ASCII text: %r", text[:60])
            return -1

        sid = self._default_speaker if speaker_id is None else speaker_id
        log.info("[TtsMaker sid=%d] %s", sid, text[:80])

        try:
            code = self._client.TtsMaker(text, sid)
        except Exception as e:
            log.error("TtsMaker call failed: %s", e)
            return -1

        if code != 0:
            # A non-zero status means the request was not accepted, so there
            # is no playback to wait for; return immediately instead of
            # stalling the caller for the estimated duration.
            log.warning("TtsMaker returned non-zero status %s", code)
            return code

        if block:
            # TtsMaker is fire-and-forget — we need to wait so the mic loop
            # knows when to unmute.
            time.sleep(self._estimated_duration(text))

        return code
|