# Sanad/local/tts.py
"""CosyVoice2 0.5B streaming TTS — GPU.
Phase 4 of the local pipeline. Holds a 3-second reference WAV in VRAM
and synthesises streaming Arabic/English audio for every text chunk
arriving from the LLM. Emits int16 PCM at the model's native rate
(CosyVoice2 outputs 22 050 Hz — we resample to `sample_rate` from
config so the downstream `audio_io.speaker` gets a consistent rate).
Install (on the robot):
cd ~/src
git clone --recursive https://github.com/FunAudioLLM/CosyVoice
cd CosyVoice
pip install -r requirements.txt
pip install -e .
# model + reference voice
huggingface-cli download FunAudioLLM/CosyVoice2-0.5B \\
--local-dir ~/sanad/model/local/CosyVoice2-0.5B
# place a 3-s Khaleeji clip at model/local/khaleeji_reference_3s.wav
# (16 kHz mono int16 WAV)
API note:
CosyVoice2 is evolving. We use the published `inference_zero_shot`
with `stream=True` which yields `{"tts_speech": tensor}` chunks.
If the upstream API renames, adapt in one place — `TtsEngine._stream`.
"""
from __future__ import annotations
from pathlib import Path
from typing import AsyncIterator, Iterator, Optional
import numpy as np
from Project.Sanad.config import MODEL_DIR
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
log = get_logger("local_tts")
# --- Tunables read from the [local.tts] config section ----------------------
_CFG = _cfg_section("local", "tts")
# Model directory name under MODEL_DIR/local/.
MODEL_SUBDIR = _CFG.get("model_subdir", "CosyVoice2-0.5B")
# File name of the 3-second reference-voice clip under MODEL_DIR/local/.
REFERENCE_WAV_SUBDIR = _CFG.get("reference_wav_subdir", "khaleeji_reference_3s.wav")
# Transcript of the reference clip for zero-shot inference ("" = no prompt).
REFERENCE_PROMPT = _CFG.get("reference_prompt", "")
# Sample rate (Hz) handed to the downstream speaker; model output is resampled to this.
OUT_RATE = int(_CFG.get("sample_rate", 16000))
# NOTE(review): not referenced in this view — presumably bounds a chunk queue elsewhere; confirm.
QUEUE_MAX = int(_CFG.get("queue_max", 3))
# Torch device for inference.
DEVICE = _CFG.get("device", "cuda")
# Absolute paths resolved against the project MODEL_DIR.
LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR
REFERENCE_WAV_PATH = MODEL_DIR / "local" / REFERENCE_WAV_SUBDIR
def _resample_int16(pcm: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
if src_rate == dst_rate or pcm.size == 0:
return pcm.astype(np.int16, copy=False)
target_len = max(1, int(len(pcm) * dst_rate / src_rate))
return np.interp(
np.linspace(0, len(pcm), target_len, endpoint=False),
np.arange(len(pcm)),
pcm.astype(np.float64),
).astype(np.int16)
class CosyVoiceTTS:
    """Thin wrapper around CosyVoice2 streaming zero-shot inference.

    Lifecycle: ``start()`` loads the model and the 3-second reference
    clip, ``synthesize_stream()`` converts one text chunk into a stream
    of int16 PCM buffers at ``OUT_RATE``, and ``stop()`` drops the
    references so GPU memory can be reclaimed.
    """

    def __init__(self) -> None:
        # Populated by start(); None means "not started".
        self._model = None
        self._ref_speech = None  # preloaded reference speech tensor
        self._ref_prompt: str = REFERENCE_PROMPT
        # Native output rate of CosyVoice2; overwritten from the model in start().
        self._model_rate: int = 22050

    def start(self) -> None:
        """Load the CosyVoice2 model and reference voice into memory.

        Raises:
            RuntimeError: if the CosyVoice package is not installed, or
                the model directory / reference WAV is missing on disk.
        """
        try:
            from cosyvoice.cli.cosyvoice import CosyVoice2
            from cosyvoice.utils.file_utils import load_wav
        except ImportError as exc:
            # Chain the ImportError so the original traceback is preserved.
            raise RuntimeError(
                f"CosyVoiceTTS requires the CosyVoice package from source: {exc}"
            ) from exc
        if not LOCAL_MODEL_DIR.exists():
            raise RuntimeError(f"CosyVoice2 model not found at {LOCAL_MODEL_DIR}")
        if not REFERENCE_WAV_PATH.exists():
            raise RuntimeError(
                f"Reference voice WAV not found at {REFERENCE_WAV_PATH}"
            )
        log.info("loading CosyVoice2: %s", LOCAL_MODEL_DIR)
        self._model = CosyVoice2(str(LOCAL_MODEL_DIR), load_jit=True, fp16=True)
        # model.sample_rate is an instance attr on CosyVoice2; cast defensively
        # in case an upstream version exposes it as a non-int.
        self._model_rate = int(getattr(self._model, "sample_rate", 22050))
        # Reference clip is expected as 16 kHz mono (see module docstring).
        self._ref_speech = load_wav(str(REFERENCE_WAV_PATH), 16000)
        log.info("CosyVoiceTTS ready (model_rate=%d)", self._model_rate)

    def synthesize_stream(self, text: str) -> Iterator[bytes]:
        """Yield int16 PCM bytes at OUT_RATE, one streaming chunk at a time.

        Yields nothing when start() has not been called. Synthesis errors
        are logged and swallowed so one bad chunk cannot kill the pipeline.
        """
        # Deliberately silent no-op before start(): caller just gets no audio.
        if self._model is None or self._ref_speech is None:
            return
        try:
            # CosyVoice2 streaming generator. Each step yields a dict whose
            # "tts_speech" is a float32 waveform tensor at the native rate.
            for step in self._model.inference_zero_shot(
                    text,
                    self._ref_prompt,
                    self._ref_speech,
                    stream=True):
                wave = step.get("tts_speech")
                if wave is None:
                    continue
                # tensor → float32 numpy → int16 at OUT_RATE. detach() guards
                # against a tensor still attached to an autograd graph, where
                # .numpy() would raise.
                arr = wave.detach().cpu().numpy().squeeze()
                if arr.size == 0:
                    continue
                # Model emits floats in [-1, 1]; clip before scaling to int16
                # so slight overshoot cannot wrap around.
                pcm_i16 = np.clip(arr * 32767.0, -32768, 32767).astype(np.int16)
                if self._model_rate != OUT_RATE:
                    pcm_i16 = _resample_int16(pcm_i16, self._model_rate, OUT_RATE)
                yield pcm_i16.tobytes()
        except Exception as exc:
            # Best-effort by design: a single failed chunk must not break the
            # ongoing conversation stream.
            log.warning("TTS synth failed for chunk %r: %s", text[:40], exc)

    def stop(self) -> None:
        """Drop model and reference tensors so VRAM can be reclaimed."""
        self._model = None
        self._ref_speech = None

    @property
    def output_rate(self) -> int:
        """Sample rate (Hz) of the PCM this engine emits downstream."""
        return OUT_RATE