"""CosyVoice2 0.5B streaming TTS — GPU. Phase 4 of the local pipeline. Holds a 3-second reference WAV in VRAM and synthesises streaming Arabic/English audio for every text chunk arriving from the LLM. Emits int16 PCM at the model's native rate (CosyVoice2 outputs 22 050 Hz — we resample to `sample_rate` from config so the downstream `audio_io.speaker` gets a consistent rate). Install (on the robot): cd ~/src git clone --recursive https://github.com/FunAudioLLM/CosyVoice cd CosyVoice pip install -r requirements.txt pip install -e . # model + reference voice huggingface-cli download FunAudioLLM/CosyVoice2-0.5B \\ --local-dir ~/sanad/model/local/CosyVoice2-0.5B # place a 3-s Khaleeji clip at model/local/khaleeji_reference_3s.wav # (16 kHz mono int16 WAV) API note: CosyVoice2 is evolving. We use the published `inference_zero_shot` with `stream=True` which yields `{"tts_speech": tensor}` chunks. If the upstream API renames, adapt in one place — `TtsEngine._stream`. """ from __future__ import annotations from pathlib import Path from typing import AsyncIterator, Iterator, Optional import numpy as np from Project.Sanad.config import MODEL_DIR from Project.Sanad.core.config_loader import section as _cfg_section from Project.Sanad.core.logger import get_logger log = get_logger("local_tts") _CFG = _cfg_section("local", "tts") MODEL_SUBDIR = _CFG.get("model_subdir", "CosyVoice2-0.5B") REFERENCE_WAV_SUBDIR = _CFG.get("reference_wav_subdir", "khaleeji_reference_3s.wav") REFERENCE_PROMPT = _CFG.get("reference_prompt", "") OUT_RATE = int(_CFG.get("sample_rate", 16000)) QUEUE_MAX = int(_CFG.get("queue_max", 3)) DEVICE = _CFG.get("device", "cuda") LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR REFERENCE_WAV_PATH = MODEL_DIR / "local" / REFERENCE_WAV_SUBDIR def _resample_int16(pcm: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray: if src_rate == dst_rate or pcm.size == 0: return pcm.astype(np.int16, copy=False) target_len = max(1, int(len(pcm) * dst_rate / src_rate)) return np.interp( np.linspace(0, len(pcm), target_len, endpoint=False), np.arange(len(pcm)), pcm.astype(np.float64), ).astype(np.int16) class CosyVoiceTTS: """Thin async wrapper around CosyVoice2 streaming inference.""" def __init__(self) -> None: self._model = None self._ref_speech = None # preloaded reference tensor self._ref_prompt = REFERENCE_PROMPT self._model_rate: int = 22050 def start(self) -> None: try: from cosyvoice.cli.cosyvoice import CosyVoice2 from cosyvoice.utils.file_utils import load_wav except ImportError as exc: raise RuntimeError( f"CosyVoiceTTS requires the CosyVoice package from source: {exc}" ) if not LOCAL_MODEL_DIR.exists(): raise RuntimeError(f"CosyVoice2 model not found at {LOCAL_MODEL_DIR}") if not REFERENCE_WAV_PATH.exists(): raise RuntimeError( f"Reference voice WAV not found at {REFERENCE_WAV_PATH}" ) log.info("loading CosyVoice2: %s", LOCAL_MODEL_DIR) self._model = CosyVoice2(str(LOCAL_MODEL_DIR), load_jit=True, fp16=True) # model.sample_rate is an instance attr on CosyVoice2 self._model_rate = getattr(self._model, "sample_rate", 22050) self._ref_speech = load_wav(str(REFERENCE_WAV_PATH), 16000) log.info("CosyVoiceTTS ready (model_rate=%d)", self._model_rate) def synthesize_stream(self, text: str) -> Iterator[bytes]: """Yield int16 PCM bytes at OUT_RATE, one streaming chunk at a time.""" if self._model is None or self._ref_speech is None: return try: # CosyVoice2 streaming generator. Each step yields a tensor # of float32 waveform samples at the model's native rate. for step in self._model.inference_zero_shot( text, self._ref_prompt, self._ref_speech, stream=True): wave = step.get("tts_speech") if wave is None: continue # tensor → float32 numpy → int16 at OUT_RATE arr = wave.cpu().numpy().squeeze() if arr.size == 0: continue pcm_i16 = np.clip(arr * 32767.0, -32768, 32767).astype(np.int16) if self._model_rate != OUT_RATE: pcm_i16 = _resample_int16(pcm_i16, self._model_rate, OUT_RATE) yield pcm_i16.tobytes() except Exception as exc: log.warning("TTS synth failed for chunk %r: %s", text[:40], exc) def stop(self) -> None: self._model = None self._ref_speech = None @property def output_rate(self) -> int: return OUT_RATE