# Sanad/voice/local_tts.py

"""Local Arabic TTS using MBZUAI/speecht5_tts_clartts_ar (SpeechT5 fine-tuned on CLArTTS).

Loads model/vocoder/speaker-embedding from the local model/ directory.
Lazy-loads on first call so the webserver starts quickly.

Output: 16 kHz mono int16 PCM bytes (matching WAV conventions).
"""

from __future__ import annotations

import re
import threading
from pathlib import Path
from typing import Any

# ── Model asset locations and audio format ──
# Optional overrides are read from the ("voice", "local_tts") config section.
# If the config loader cannot be imported, or raises, every setting below
# falls back to its built-in default.
try:
    from Project.Sanad.core.config_loader import section as _cfg_section
    _TTS = _cfg_section("voice", "local_tts")
except Exception:
    _TTS = {}

# Audio format reported for the synthesized PCM stream.
SAMPLE_RATE = _TTS.get("sample_rate", 16000)
CHANNELS = _TTS.get("channels", 1)

# Two directory levels above this file is the project directory (Sanad/);
# all pre-downloaded assets live in its model/ sub-directory.
_PROJECT_DIR = Path(__file__).resolve().parent.parent
_MODEL_ROOT = _PROJECT_DIR.joinpath("model")

MODEL_DIR = _MODEL_ROOT.joinpath(
    _TTS.get("model_subdir", "speecht5_tts_clartts_ar")
)
VOCODER_DIR = _MODEL_ROOT.joinpath(
    _TTS.get("vocoder_subdir", "speecht5_hifigan")
)
XVECTOR_PATH = _MODEL_ROOT.joinpath(
    _TTS.get("xvector_filename", "arabic_xvector_embedding.pt")
)

# String forms of the two directories, as passed to `from_pretrained`.
MODEL_ID = str(MODEL_DIR)
VOCODER_ID = str(VOCODER_DIR)

# Character class covering Arabic diacritic marks (tashkeel) and related
# annotation signs; they are removed because the model was trained on
# undiacritized text.
_DIACRITICS_RE = re.compile(r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]")


def strip_diacritics(text: str) -> str:
    """Return *text* with every character matched by ``_DIACRITICS_RE`` removed."""
    # Splitting on the marks and re-joining the remaining pieces drops exactly
    # the matched characters and leaves everything else untouched.
    kept_pieces = _DIACRITICS_RE.split(text)
    return "".join(kept_pieces)


class LocalTTSEngine:
    """Lazily-loaded SpeechT5 text-to-speech engine.

    Nothing is loaded at construction time. The processor, acoustic model,
    vocoder and speaker embedding are read from ``MODEL_DIR``, ``VOCODER_DIR``
    and ``XVECTOR_PATH`` the first time synthesis is requested.
    """

    def __init__(self):
        # Guards the one-time load so concurrent first calls load only once.
        self._lock = threading.Lock()
        self._loaded = False
        self._processor = None
        self._model = None
        self._vocoder = None
        self._speaker_embedding = None

    def _ensure_loaded(self):
        """Load all model artefacts on first use; later calls return at once.

        Raises:
            RuntimeError: if the model directory, vocoder directory or
                x-vector file does not exist.
        """
        if self._loaded:
            return
        with self._lock:
            # Re-check under the lock: another thread may have finished
            # loading while this one was waiting.
            if self._loaded:
                return

            for label, p in [("Model", MODEL_DIR), ("Vocoder", VOCODER_DIR), ("XVector", XVECTOR_PATH)]:
                if not p.exists():
                    raise RuntimeError(f"{label} not found at {p}")

            # Heavy imports are deferred to here so importing this module
            # stays cheap.
            import torch
            from transformers import (
                SpeechT5ForTextToSpeech,
                SpeechT5HifiGan,
                SpeechT5Processor,
            )

            # Load into locals first so a failure part-way through does not
            # leave the engine with a half-populated set of attributes.
            processor = SpeechT5Processor.from_pretrained(MODEL_ID)
            model = SpeechT5ForTextToSpeech.from_pretrained(MODEL_ID)
            vocoder = SpeechT5HifiGan.from_pretrained(VOCODER_ID)
            # NOTE(security): torch.load unpickles the file, so XVECTOR_PATH
            # must only ever point at a trusted, locally provisioned artefact.
            speaker_embedding = torch.load(str(XVECTOR_PATH), map_location="cpu")

            self._processor = processor
            self._model = model
            self._vocoder = vocoder
            self._speaker_embedding = speaker_embedding
            self._loaded = True

    @property
    def ready(self) -> bool:
        """True once the model artefacts have been loaded successfully."""
        return self._loaded

    def status(self) -> dict[str, Any]:
        """Return a snapshot of load state, asset paths and their existence."""
        return {
            "loaded": self._loaded,
            "model_dir": str(MODEL_DIR),
            "vocoder_dir": str(VOCODER_DIR),
            "xvector_path": str(XVECTOR_PATH),
            "model_exists": MODEL_DIR.exists(),
            "vocoder_exists": VOCODER_DIR.exists(),
            "xvector_exists": XVECTOR_PATH.exists(),
            "sample_rate": SAMPLE_RATE,
        }

    def synthesize(self, text: str) -> bytes:
        """Convert Arabic text to 16 kHz mono int16 PCM bytes.

        The input is validated before the model is loaded, so an empty
        request is rejected without paying the load cost.

        Args:
            text: Text to speak. Diacritics and surrounding whitespace are
                removed before synthesis.

        Returns:
            Raw int16 PCM sample bytes.

        Raises:
            RuntimeError: if nothing remains of ``text`` after cleaning, or
                if a required model asset is missing.
        """
        # Treat None / "" / whitespace-only as empty up front.
        stripped = text.strip() if text else ""
        # Strip again after removing diacritics: whitespace that sat between
        # marks would otherwise survive as the whole input.
        clean_text = strip_diacritics(stripped).strip() if stripped else ""
        if not clean_text:
            raise RuntimeError("Text is empty after stripping diacritics.")

        self._ensure_loaded()
        import torch

        inputs = self._processor(text=clean_text, return_tensors="pt")

        with torch.no_grad():
            speech = self._model.generate_speech(
                inputs["input_ids"],
                self._speaker_embedding,
                vocoder=self._vocoder,
            )

        # speech is a 1-D float32 tensor in [-1, 1] at 16 kHz
        pcm_float = speech.numpy()
        # Scale to the int16 range and clip so out-of-range samples saturate
        # instead of wrapping around.
        pcm_int16 = (pcm_float * 32767).clip(-32768, 32767).astype("int16")
        return pcm_int16.tobytes()

    def synthesize_wav(self, text: str) -> bytes:
        """Return a complete WAV file (bytes) for the given text.

        The PCM from :meth:`synthesize` is wrapped in a WAV container using
        ``CHANNELS`` and ``SAMPLE_RATE`` with 2-byte samples.

        Raises:
            RuntimeError: propagated from :meth:`synthesize`.
        """
        import io
        import wave

        pcm = self.synthesize(text)
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(2)  # int16
            wf.setframerate(SAMPLE_RATE)
            wf.writeframes(pcm)
        return buf.getvalue()