127 lines
4.8 KiB
Python
127 lines
4.8 KiB
Python
"""CosyVoice2 0.5B streaming TTS — GPU.
|
|
|
|
Phase 4 of the local pipeline. Holds a 3-second reference WAV in VRAM
|
|
and synthesises streaming Arabic/English audio for every text chunk
|
|
arriving from the LLM. Emits int16 PCM at the model's native rate
|
|
(CosyVoice2 outputs 22 050 Hz — we resample to `sample_rate` from
|
|
config so the downstream `audio_io.speaker` gets a consistent rate).
|
|
|
|
Install (on the robot):
|
|
cd ~/src
|
|
git clone --recursive https://github.com/FunAudioLLM/CosyVoice
|
|
cd CosyVoice
|
|
pip install -r requirements.txt
|
|
pip install -e .
|
|
|
|
# model + reference voice
|
|
huggingface-cli download FunAudioLLM/CosyVoice2-0.5B \\
|
|
--local-dir ~/sanad/model/local/CosyVoice2-0.5B
|
|
# place a 3-s Khaleeji clip at model/local/khaleeji_reference_3s.wav
|
|
# (16 kHz mono int16 WAV)
|
|
|
|
API note:
|
|
CosyVoice2 is evolving. We use the published `inference_zero_shot`
|
|
with `stream=True` which yields `{"tts_speech": tensor}` chunks.
|
|
If the upstream API renames, adapt in one place — `TtsEngine._stream`.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from typing import AsyncIterator, Iterator, Optional
|
|
|
|
import numpy as np
|
|
|
|
from Project.Sanad.config import MODEL_DIR
|
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
|
from Project.Sanad.core.logger import get_logger
|
|
|
|
log = get_logger("local_tts")

# "tts" section of the "local" config file — every tunable below reads from it.
_CFG = _cfg_section("local", "tts")

# Directory name of the CosyVoice2 checkpoint under MODEL_DIR/local.
MODEL_SUBDIR = _CFG.get("model_subdir", "CosyVoice2-0.5B")
# Filename of the 3-second reference-voice WAV under MODEL_DIR/local.
REFERENCE_WAV_SUBDIR = _CFG.get("reference_wav_subdir", "khaleeji_reference_3s.wav")
# Transcript of the reference clip, passed as the zero-shot prompt text.
REFERENCE_PROMPT = _CFG.get("reference_prompt", "")
# Output sample rate (Hz) delivered downstream; PCM is resampled to this.
OUT_RATE = int(_CFG.get("sample_rate", 16000))
# Max queued audio chunks — NOTE(review): not referenced in this chunk;
# presumably consumed by the caller that drives synthesize_stream. Verify.
QUEUE_MAX = int(_CFG.get("queue_max", 3))
# Torch device string — NOTE(review): not referenced in this chunk either;
# CosyVoice2 appears to pick its own device. Confirm before relying on it.
DEVICE = _CFG.get("device", "cuda")

# Resolved filesystem locations for the model and the reference clip.
LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR
REFERENCE_WAV_PATH = MODEL_DIR / "local" / REFERENCE_WAV_SUBDIR
|
|
|
|
|
|
def _resample_int16(pcm: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
|
|
if src_rate == dst_rate or pcm.size == 0:
|
|
return pcm.astype(np.int16, copy=False)
|
|
target_len = max(1, int(len(pcm) * dst_rate / src_rate))
|
|
return np.interp(
|
|
np.linspace(0, len(pcm), target_len, endpoint=False),
|
|
np.arange(len(pcm)),
|
|
pcm.astype(np.float64),
|
|
).astype(np.int16)
|
|
|
|
|
|
class CosyVoiceTTS:
    """Thin wrapper around CosyVoice2 streaming zero-shot inference.

    Lifecycle: ``start()`` loads the model and the reference clip once;
    ``synthesize_stream()`` is then called per text chunk; ``stop()``
    drops the references so the interpreter can reclaim the memory.
    """

    def __init__(self) -> None:
        # Populated by start(); None means "not started / stopped".
        self._model = None
        self._ref_speech = None  # preloaded reference speech (loaded at 16 kHz)
        self._ref_prompt: str = REFERENCE_PROMPT
        # Native output rate of the model; refined from the instance in start().
        self._model_rate: int = 22050

    def start(self) -> None:
        """Load CosyVoice2 and the reference voice WAV.

        Raises:
            RuntimeError: if the CosyVoice package is not installed, the
                model directory is missing, or the reference WAV is missing.
        """
        try:
            from cosyvoice.cli.cosyvoice import CosyVoice2
            from cosyvoice.utils.file_utils import load_wav
        except ImportError as exc:
            # Chain the ImportError so the real missing module survives
            # in the traceback next to the install hint.
            raise RuntimeError(
                f"CosyVoiceTTS requires the CosyVoice package from source: {exc}"
            ) from exc
        if not LOCAL_MODEL_DIR.exists():
            raise RuntimeError(f"CosyVoice2 model not found at {LOCAL_MODEL_DIR}")
        if not REFERENCE_WAV_PATH.exists():
            raise RuntimeError(
                f"Reference voice WAV not found at {REFERENCE_WAV_PATH}"
            )
        log.info("loading CosyVoice2: %s", LOCAL_MODEL_DIR)
        self._model = CosyVoice2(str(LOCAL_MODEL_DIR), load_jit=True, fp16=True)
        # model.sample_rate is an instance attr on CosyVoice2; coerce to int
        # so the `!= OUT_RATE` comparison in synthesize_stream is type-stable.
        self._model_rate = int(getattr(self._model, "sample_rate", 22050))
        self._ref_speech = load_wav(str(REFERENCE_WAV_PATH), 16000)
        log.info("CosyVoiceTTS ready (model_rate=%d)", self._model_rate)

    def synthesize_stream(self, text: str) -> Iterator[bytes]:
        """Yield int16 PCM bytes at OUT_RATE, one streaming chunk at a time.

        Yields nothing when the engine has not been started (or was
        stopped). Synthesis errors are logged and end this chunk's stream
        early rather than propagate — one bad chunk must not kill the
        pipeline.
        """
        if self._model is None or self._ref_speech is None:
            return
        try:
            # CosyVoice2 streaming generator. Each step yields a tensor
            # of float32 waveform samples at the model's native rate.
            for step in self._model.inference_zero_shot(
                    text,
                    self._ref_prompt,
                    self._ref_speech,
                    stream=True):
                wave = step.get("tts_speech")
                if wave is None:
                    continue
                # tensor → float32 numpy → int16 at OUT_RATE
                arr = wave.cpu().numpy().squeeze()
                if arr.size == 0:
                    continue
                # Clip before the cast: float samples can slightly exceed
                # [-1, 1] and would otherwise wrap around in int16.
                pcm_i16 = np.clip(arr * 32767.0, -32768, 32767).astype(np.int16)
                if self._model_rate != OUT_RATE:
                    pcm_i16 = _resample_int16(pcm_i16, self._model_rate, OUT_RATE)
                yield pcm_i16.tobytes()
        except Exception as exc:
            # Best-effort by design: log and truncate this chunk's audio.
            log.warning("TTS synth failed for chunk %r: %s", text[:40], exc)

    def stop(self) -> None:
        """Release the model and reference tensors."""
        self._model = None
        self._ref_speech = None

    @property
    def output_rate(self) -> int:
        """Sample rate (Hz) of the PCM produced by synthesize_stream()."""
        return OUT_RATE
|