# Sanad/local/stt.py
"""faster-whisper Large V3 Turbo — GPU INT8 transcription.
Phase 2 of the local pipeline. Given an utterance (int16 PCM bytes at
16 kHz), returns transcribed text. Short / empty / no-speech results are
filtered out per config thresholds to avoid firing phantom triggers.
Install (on the robot, in the `local` env):
pip install faster-whisper==1.0.*
# model auto-downloads from HuggingFace on first `WhisperModel(...)` call,
# OR pre-download to model/local/faster-whisper-large-v3-turbo/ and point
# `local.stt.model_subdir` at it.
"""
from __future__ import annotations
from typing import Optional
import numpy as np
from Project.Sanad.config import MODEL_DIR
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
log = get_logger("local_stt")
_CFG = _cfg_section("local", "stt")
MODEL_NAME = _CFG.get("model_name", "large-v3-turbo")
MODEL_SUBDIR = _CFG.get("model_subdir", "faster-whisper-large-v3-turbo")
DEVICE = _CFG.get("device", "cuda")
COMPUTE_TYPE = _CFG.get("compute_type", "int8_float16")
BEAM_SIZE = _CFG.get("beam_size", 1)
LANGUAGE = _CFG.get("language") # None = auto-detect
VAD_FILTER = _CFG.get("vad_filter", False)
NO_SPEECH_THRESHOLD = _CFG.get("no_speech_threshold", 0.6)
MIN_CHARS = _CFG.get("min_utterance_chars", 2)
TEMPERATURE = _CFG.get("temperature", 0.0)
LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR
class WhisperSTT:
    """Thin wrapper around faster_whisper.WhisperModel.

    Lifecycle: start() loads the model into VRAM, transcribe() turns
    int16 PCM bytes into text, stop() drops the model reference so the
    memory can be reclaimed.
    """

    def __init__(self) -> None:
        # Loaded lazily by start(); None means "not ready".
        self._model: Optional[object] = None

    def start(self) -> None:
        """Load the model into VRAM. ~4 s on first call, 100 ms after.

        Raises:
            RuntimeError: if the `faster-whisper` package is not installed.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            # Chain the original ImportError (`from exc`) so the real
            # cause survives in the traceback.
            raise RuntimeError(
                f"WhisperSTT requires 'faster-whisper': {exc}"
            ) from exc
        # Prefer the pre-downloaded local snapshot; otherwise let
        # faster-whisper resolve the hub model name.
        model_src = str(LOCAL_MODEL_DIR) if LOCAL_MODEL_DIR.exists() else MODEL_NAME
        log.info("loading Whisper: src=%s device=%s compute=%s",
                 model_src, DEVICE, COMPUTE_TYPE)
        self._model = WhisperModel(
            model_src,
            device=DEVICE,
            compute_type=COMPUTE_TYPE,
        )
        log.info("WhisperSTT ready")

    def transcribe(self, pcm: bytes) -> str:
        """Blocking transcription. Returns the full text or ''.

        Args:
            pcm: raw little-endian int16 PCM samples (16 kHz mono,
                per the module docstring).

        Returns:
            The joined transcript, or '' when the model is not loaded,
            the input is empty, decoding fails, or the result is shorter
            than MIN_CHARS.
        """
        if self._model is None:
            log.warning("WhisperSTT.transcribe called before start()")
            return ""
        if not pcm:
            return ""
        # int16 -> float32 in [-1.0, 1.0), the format faster-whisper expects.
        audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
        if audio.size == 0:
            return ""
        try:
            # `info` was unused; keep the tuple unpack but mark it ignored.
            segments, _info = self._model.transcribe(
                audio,
                beam_size=BEAM_SIZE,
                language=LANGUAGE,
                vad_filter=VAD_FILTER,
                no_speech_threshold=NO_SPEECH_THRESHOLD,
                temperature=TEMPERATURE,
            )
            # `segments` is lazy; joining here consumes it fully so any
            # decode error surfaces inside this try block.
            text = " ".join(seg.text.strip() for seg in segments).strip()
        except Exception as exc:
            # Best-effort: a failed decode must not crash the pipeline.
            log.warning("Whisper transcribe failed: %s", exc)
            return ""
        if len(text) < MIN_CHARS:
            # Filter near-empty decodes to avoid firing phantom triggers.
            log.debug("drop short transcript: %r", text)
            return ""
        return text

    def stop(self) -> None:
        """Release the model reference so its memory can be freed."""
        self._model = None