127 lines
4.8 KiB
Python
127 lines
4.8 KiB
Python
"""CosyVoice2 0.5B streaming TTS — GPU.
|
|
|
|
Phase 4 of the local pipeline. Holds a 3-second reference WAV in VRAM
|
|
and synthesises streaming Arabic/English audio for every text chunk
|
|
arriving from the LLM. Emits int16 PCM at the model's native rate
|
|
(CosyVoice2 outputs 22 050 Hz — we resample to `sample_rate` from
|
|
config so the downstream `audio_io.speaker` gets a consistent rate).
|
|
|
|
Install (on the robot):
|
|
cd ~/src
|
|
git clone --recursive https://github.com/FunAudioLLM/CosyVoice
|
|
cd CosyVoice
|
|
pip install -r requirements.txt
|
|
pip install -e .
|
|
|
|
# model + reference voice
|
|
huggingface-cli download FunAudioLLM/CosyVoice2-0.5B \\
|
|
--local-dir ~/sanad/model/local/CosyVoice2-0.5B
|
|
# place a 3-s Khaleeji clip at model/local/khaleeji_reference_3s.wav
|
|
# (16 kHz mono int16 WAV)
|
|
|
|
API note:
|
|
CosyVoice2 is evolving. We use the published `inference_zero_shot`
|
|
with `stream=True` which yields `{"tts_speech": tensor}` chunks.
|
|
If the upstream API renames, adapt in one place — `TtsEngine._stream`.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from typing import AsyncIterator, Iterator, Optional
|
|
|
|
import numpy as np
|
|
|
|
from Project.Sanad.config import MODEL_DIR
|
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
|
from Project.Sanad.core.logger import get_logger
|
|
|
|
log = get_logger("local_tts")

# "tts" section of the "local" config file — every tunable below reads from it.
_CFG = _cfg_section("local", "tts")

# Directory name of the CosyVoice2 checkpoint under MODEL_DIR/local.
MODEL_SUBDIR = _CFG.get("model_subdir", "CosyVoice2-0.5B")
# Filename of the 3-second reference-voice WAV under MODEL_DIR/local.
REFERENCE_WAV_SUBDIR = _CFG.get("reference_wav_subdir", "khaleeji_reference_3s.wav")
# Transcript of the reference clip, passed as the zero-shot prompt text.
REFERENCE_PROMPT = _CFG.get("reference_prompt", "")
# Output sample rate (Hz) delivered downstream; PCM is resampled to this.
OUT_RATE = int(_CFG.get("sample_rate", 16000))
# Max queued audio chunks — NOTE(review): not referenced in this chunk;
# presumably consumed by the caller that drives synthesize_stream. Verify.
QUEUE_MAX = int(_CFG.get("queue_max", 3))
# Torch device string — NOTE(review): not referenced in this chunk either;
# CosyVoice2 appears to pick its own device. Confirm before relying on it.
DEVICE = _CFG.get("device", "cuda")

# Resolved filesystem locations for the model and the reference clip.
LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR
REFERENCE_WAV_PATH = MODEL_DIR / "local" / REFERENCE_WAV_SUBDIR
|
|
|
|
|
|
def _resample_int16(pcm: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
|
|
if src_rate == dst_rate or pcm.size == 0:
|
|
return pcm.astype(np.int16, copy=False)
|
|
target_len = max(1, int(len(pcm) * dst_rate / src_rate))
|
|
return np.interp(
|
|
np.linspace(0, len(pcm), target_len, endpoint=False),
|
|
np.arange(len(pcm)),
|
|
pcm.astype(np.float64),
|
|
).astype(np.int16)
|
|
|
|
|
|
class CosyVoiceTTS:
    """Thin wrapper around CosyVoice2 streaming zero-shot inference.

    Lifecycle: ``start()`` loads the model and the reference clip once;
    ``synthesize_stream()`` is then called per text chunk; ``stop()``
    drops the references so the interpreter can reclaim the memory.
    """

    def __init__(self) -> None:
        # Populated by start(); None means "not started / stopped".
        self._model = None
        self._ref_speech = None  # preloaded reference speech (loaded at 16 kHz)
        self._ref_prompt: str = REFERENCE_PROMPT
        # Native output rate of the model; refined from the instance in start().
        self._model_rate: int = 22050

    def start(self) -> None:
        """Load CosyVoice2 and the reference voice WAV.

        Raises:
            RuntimeError: if the CosyVoice package is not installed, the
                model directory is missing, or the reference WAV is missing.
        """
        try:
            from cosyvoice.cli.cosyvoice import CosyVoice2
            from cosyvoice.utils.file_utils import load_wav
        except ImportError as exc:
            # Chain the ImportError so the real missing module survives
            # in the traceback next to the install hint.
            raise RuntimeError(
                f"CosyVoiceTTS requires the CosyVoice package from source: {exc}"
            ) from exc
        if not LOCAL_MODEL_DIR.exists():
            raise RuntimeError(f"CosyVoice2 model not found at {LOCAL_MODEL_DIR}")
        if not REFERENCE_WAV_PATH.exists():
            raise RuntimeError(
                f"Reference voice WAV not found at {REFERENCE_WAV_PATH}"
            )
        log.info("loading CosyVoice2: %s", LOCAL_MODEL_DIR)
        self._model = CosyVoice2(str(LOCAL_MODEL_DIR), load_jit=True, fp16=True)
        # model.sample_rate is an instance attr on CosyVoice2; coerce to int
        # so the `!= OUT_RATE` comparison in synthesize_stream is type-stable.
        self._model_rate = int(getattr(self._model, "sample_rate", 22050))
        self._ref_speech = load_wav(str(REFERENCE_WAV_PATH), 16000)
        log.info("CosyVoiceTTS ready (model_rate=%d)", self._model_rate)

    def synthesize_stream(self, text: str) -> Iterator[bytes]:
        """Yield int16 PCM bytes at OUT_RATE, one streaming chunk at a time.

        Yields nothing when the engine has not been started (or was
        stopped). Synthesis errors are logged and end this chunk's stream
        early rather than propagate — one bad chunk must not kill the
        pipeline.
        """
        if self._model is None or self._ref_speech is None:
            return
        try:
            # CosyVoice2 streaming generator. Each step yields a tensor
            # of float32 waveform samples at the model's native rate.
            for step in self._model.inference_zero_shot(
                    text,
                    self._ref_prompt,
                    self._ref_speech,
                    stream=True):
                wave = step.get("tts_speech")
                if wave is None:
                    continue
                # tensor → float32 numpy → int16 at OUT_RATE
                arr = wave.cpu().numpy().squeeze()
                if arr.size == 0:
                    continue
                # Clip before the cast: float samples can slightly exceed
                # [-1, 1] and would otherwise wrap around in int16.
                pcm_i16 = np.clip(arr * 32767.0, -32768, 32767).astype(np.int16)
                if self._model_rate != OUT_RATE:
                    pcm_i16 = _resample_int16(pcm_i16, self._model_rate, OUT_RATE)
                yield pcm_i16.tobytes()
        except Exception as exc:
            # Best-effort by design: log and truncate this chunk's audio.
            log.warning("TTS synth failed for chunk %r: %s", text[:40], exc)

    def stop(self) -> None:
        """Release the model and reference tensors."""
        self._model = None
        self._ref_speech = None

    @property
    def output_rate(self) -> int:
        """Sample rate (Hz) of the PCM produced by synthesize_stream()."""
        return OUT_RATE
|