# Marcus/Voice/builtin_tts.py

"""
builtin_tts.py — Unitree G1 built-in TTS (English only)
========================================================
Thin wrapper around AudioClient.TtsMaker(text, speaker_id). The G1's on-board
TTS engine synthesizes and plays directly through the body speaker — no
internet, no MP3/WAV roundtrip, no audio SDK plumbing on our side.

Supported languages (firmware-side):
    English  — works  (Marcus uses this)
    Chinese  — works  (unused)
    Arabic   — silently falls back to Chinese (unusable — we refuse these)

Signature:
    client.TtsMaker(text: str, speaker_id: int) -> int    # 0 = success
    speaker_id ∈ {0, 1, 2}   — different voice timbres

Usage:
    from Voice.builtin_tts import BuiltinTTS
    tts = BuiltinTTS(audio_client)
    tts.speak("Hello, I am Sanad", speaker_id=0)
"""

from __future__ import annotations

import logging
import time
from typing import Optional

# Module-level logger used by BuiltinTTS. It is registered under the fixed
# name "builtin_tts" rather than this module's dotted import path.
log = logging.getLogger("builtin_tts")


class BuiltinTTS:
    """Synchronous English-only TTS via the G1's on-board engine.

    Wraps ``audio_client.TtsMaker(text, speaker_id)``. Because that call is
    fire-and-forget, `speak()` can optionally sleep for an estimated playback
    duration so the caller knows when the robot has finished talking.
    """

    # Rough playback duration per character — enough margin that `speak()`
    # returns after audio has actually finished on the robot.
    SECONDS_PER_CHAR = 0.08
    # Lower bound, in seconds, on the blocking wait after a successful call.
    MIN_SECONDS = 1.5

    def __init__(self, audio_client, default_speaker_id: int = 0) -> None:
        """
        Args:
            audio_client       : initialized unitree_sdk2py AudioClient
            default_speaker_id : 0, 1, or 2 (default voice timbre)
        """
        self._client = audio_client
        self._default_speaker = default_speaker_id

    def speak(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        block: bool = True,
    ) -> int:
        """
        Play `text` on the G1 speaker via TtsMaker.

        English-only by policy. Non-ASCII (Arabic) input is rejected rather
        than silently played back as Chinese.

        Args:
            text       : phrase to speak; empty / whitespace-only is rejected
            speaker_id : voice timbre; None selects the instance default
            block      : when True, sleep for the estimated playback time
                         after a successful TtsMaker call

        Returns:
            The TtsMaker status code (0 = success), or -1 if the input was
            rejected or the TtsMaker call raised. A non-zero status is
            returned immediately, without the playback wait.
        """
        if not text or not text.strip():
            return -1

        # Reject non-English. TtsMaker "falls back" by playing Arabic text
        # as Chinese phonemes — intelligible to nobody — so we refuse it
        # rather than surprise the operator.
        if not text.isascii():
            log.warning("builtin_tts refusing non-ASCII text: %r", text[:60])
            return -1

        sid = self._default_speaker if speaker_id is None else speaker_id
        log.info("[TtsMaker sid=%d] %s", sid, text[:80])

        try:
            code = self._client.TtsMaker(text, sid)
        except Exception as e:
            log.error("TtsMaker call failed: %s", e)
            return -1

        if code != 0:
            # The engine rejected the request, so nothing is playing. Do not
            # hold the caller (and the muted mic loop) for audio that will
            # never come — same as the exception path above.
            log.warning(
                "TtsMaker returned status %s; skipping playback wait", code
            )
            return code

        if block:
            # Estimate how long the G1 is going to take to finish speaking.
            # TtsMaker is fire-and-forget — we need to wait so the mic loop
            # knows when to unmute.
            duration = max(self.MIN_SECONDS, len(text) * self.SECONDS_PER_CHAR)
            time.sleep(duration)

        return code