# Marcus/Voice/builtin_tts.py

"""
builtin_tts.py — Unitree G1 built-in TTS (English)
===================================================
Thin wrapper around AudioClient.TtsMaker(text, speaker_id). The G1's on-board
TTS engine synthesizes and plays directly through the body speaker — no
internet, no MP3/WAV roundtrip, no audio SDK plumbing on our side.

Speaker IDs — IMPORTANT:
    The `speaker_id` argument is *not* a language selector; it's a voice
    bank. On some G1 firmware revisions, speaker_id 0 produces a Mandarin
    voice even for Latin-script input. If `Config/config_Voice.json::
    tts.builtin_speaker_id` gives you Chinese output, try 1 or 2.

    Quick probe on the robot:
        python3 /home/unitree/Marcus/Voice/builtin_tts.py 0 1 2
    (plays "Hello, I am Sanad" once per speaker_id so you can pick the
    one that sounds like English, then set that in config_Voice.json).

Usage:
    from Voice.builtin_tts import BuiltinTTS
    tts = BuiltinTTS(audio_client, default_speaker_id=1)
    tts.speak("Hello, I am Sanad")
"""

from __future__ import annotations

import logging
import time
from typing import Optional

log = logging.getLogger("builtin_tts")


class BuiltinTTS:
    """Synchronous English-only TTS via the G1's on-board engine.

    Wraps ``audio_client.TtsMaker(text, speaker_id)``. Because that call is
    fire-and-forget, `speak()` can optionally block for an *estimated*
    playback duration so callers (e.g. the mic loop) know when it is safe
    to unmute.
    """

    # Rough playback duration per character — enough margin that `speak()`
    # returns after audio has actually finished on the robot.
    SECONDS_PER_CHAR = 0.08
    # Lower bound on the blocking wait, so very short phrases still get a
    # sensible pause.
    MIN_SECONDS = 1.5

    def __init__(self, audio_client, default_speaker_id: int = 0):
        """
        Args:
            audio_client       : initialized unitree_sdk2py AudioClient
            default_speaker_id : 0, 1, or 2 (default voice timbre)
        """
        self._client = audio_client
        self._default_speaker = default_speaker_id

    def speak(
        self,
        text: str,
        speaker_id: Optional[int] = None,
        block: bool = True,
    ) -> int:
        """
        Play `text` on the G1 speaker via TtsMaker.

        English-only by policy. Non-ASCII (Arabic) input is rejected rather
        than silently played back as Chinese.

        Args:
            text       : phrase to speak; empty / whitespace-only is rejected
            speaker_id : voice bank for this call; None uses the default
                         given to the constructor
            block      : when True and TtsMaker reports success, sleep for
                         the estimated playback duration before returning

        Returns:
            The TtsMaker status code (0 = success), or -1 if the input was
            rejected or the TtsMaker call raised.
        """
        if not text or not text.strip():
            return -1

        # Reject non-English. TtsMaker "falls back" by playing Arabic text
        # as Chinese phonemes — intelligible to nobody — so we refuse it
        # rather than surprise the operator.
        if not text.isascii():
            log.warning("builtin_tts refusing non-ASCII text: %r", text[:60])
            return -1

        sid = self._default_speaker if speaker_id is None else speaker_id
        # %s (not %d) so a mis-typed speaker id from config cannot break the
        # log record itself; output is identical for ints.
        log.info("[TtsMaker sid=%s] %s", sid, text[:80])

        try:
            code = self._client.TtsMaker(text, sid)
        except Exception as e:
            # Best-effort boundary: a TTS failure must not take the caller
            # down, but keep the traceback for diagnosis.
            log.exception("TtsMaker call failed: %s", e)
            return -1

        if code != 0:
            # Nothing is playing, so there is nothing to wait for — return
            # immediately instead of holding the caller for the full
            # estimated duration.
            log.warning("TtsMaker returned status %s; skipping playback wait", code)
            return code

        if block:
            # Estimate how long the G1 is going to take to finish speaking.
            # TtsMaker is fire-and-forget — we need to wait so the mic loop
            # knows when to unmute.
            duration = max(self.MIN_SECONDS, len(text) * self.SECONDS_PER_CHAR)
            time.sleep(duration)

        return code


# ─── STANDALONE PROBE ────────────────────────────────────────────────
# Usage on the robot (in the marcus conda env):
#     python3 Voice/builtin_tts.py              # plays all 3 speaker IDs
#     python3 Voice/builtin_tts.py 1 2          # plays only IDs 1 and 2
# Pick whichever ID sounds like English and set it in
#     Config/config_Voice.json :: tts.builtin_speaker_id

def _parse_speaker_ids(args):
    """Turn probe CLI arguments into a list of speaker IDs.

    Args:
        args: command-line arguments after the script name.

    Returns:
        ``[0, 1, 2]`` when no arguments are given, otherwise each argument
        converted with ``int()``, in order.

    Raises:
        ValueError: if any argument is not an integer literal.
    """
    if not args:
        return [0, 1, 2]
    try:
        return [int(arg) for arg in args]
    except ValueError:
        raise ValueError(
            f"speaker IDs must be integers, got: {' '.join(args)}"
        ) from None


def _run_probe(args):
    """Speak a fixed phrase once per requested speaker ID on the robot.

    Arguments are validated before the SDK is imported or the DDS channel
    is opened, so a typo on the command line fails fast without touching
    the hardware. Any non-zero status from `BuiltinTTS.speak()` is printed,
    so a failed call is distinguishable from a voice bank that is silent.
    """
    import sys

    try:
        ids = _parse_speaker_ids(args)
    except ValueError as err:
        # sys.exit(str) prints to stderr and exits with status 1.
        sys.exit(f"usage: builtin_tts.py [SPEAKER_ID ...]\n{err}")

    from unitree_sdk2py.core.channel import ChannelFactoryInitialize
    from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient

    ChannelFactoryInitialize(0, "eth0")
    ac = AudioClient()
    ac.SetTimeout(10.0)
    ac.Init()
    ac.SetVolume(100)

    phrase = "Hello, I am Sanad."
    print(f"\nProbing TtsMaker with text: {phrase!r}\n")
    tts = BuiltinTTS(ac)
    for sid in ids:
        print(f"  → speaker_id = {sid}")
        code = tts.speak(phrase, speaker_id=sid, block=True)
        if code != 0:
            print(f"    ! speak() returned status {code} — audio may not have played")
        time.sleep(0.5)
    print("\nDone. Pick the speaker_id that sounded like English and put it in")
    print("   Config/config_Voice.json :: tts.builtin_speaker_id")


if __name__ == "__main__":
    import sys

    _run_probe(sys.argv[1:])