# Sanad/local/stt.py
"""faster-whisper Large V3 Turbo — GPU INT8 transcription.
Phase 2 of the local pipeline. Given an utterance (int16 PCM bytes at
16 kHz), returns transcribed text. Short / empty / no-speech results are
filtered out per config thresholds to avoid firing phantom triggers.
Install (on the robot, in the `local` env):
pip install faster-whisper==1.0.*
# model auto-downloads from HuggingFace on first `WhisperModel(...)` call,
# OR pre-download to model/local/faster-whisper-large-v3-turbo/ and point
# `local.stt.model_subdir` at it.
"""
from __future__ import annotations
from typing import Optional
import numpy as np
from Project.Sanad.config import MODEL_DIR
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
log = get_logger("local_stt")
_CFG = _cfg_section("local", "stt")
MODEL_NAME = _CFG.get("model_name", "large-v3-turbo")
MODEL_SUBDIR = _CFG.get("model_subdir", "faster-whisper-large-v3-turbo")
DEVICE = _CFG.get("device", "cuda")
COMPUTE_TYPE = _CFG.get("compute_type", "int8_float16")
BEAM_SIZE = _CFG.get("beam_size", 1)
LANGUAGE = _CFG.get("language") # None = auto-detect
VAD_FILTER = _CFG.get("vad_filter", False)
NO_SPEECH_THRESHOLD = _CFG.get("no_speech_threshold", 0.6)
MIN_CHARS = _CFG.get("min_utterance_chars", 2)
TEMPERATURE = _CFG.get("temperature", 0.0)
LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR
class WhisperSTT:
    """Thin wrapper around faster_whisper.WhisperModel.

    Lifecycle: start() loads the model into VRAM, transcribe() turns
    int16 PCM bytes into text, stop() drops the model reference so the
    memory can be reclaimed.
    """

    def __init__(self) -> None:
        # Loaded lazily by start(); None means "not ready".
        self._model: Optional[object] = None

    def start(self) -> None:
        """Load the model into VRAM. ~4 s on first call, 100 ms after.

        Raises:
            RuntimeError: if the `faster-whisper` package is not installed.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            # Chain the original ImportError (`from exc`) so the real
            # cause survives in the traceback.
            raise RuntimeError(
                f"WhisperSTT requires 'faster-whisper': {exc}"
            ) from exc
        # Prefer the pre-downloaded local snapshot; otherwise let
        # faster-whisper resolve the hub model name.
        model_src = str(LOCAL_MODEL_DIR) if LOCAL_MODEL_DIR.exists() else MODEL_NAME
        log.info("loading Whisper: src=%s device=%s compute=%s",
                 model_src, DEVICE, COMPUTE_TYPE)
        self._model = WhisperModel(
            model_src,
            device=DEVICE,
            compute_type=COMPUTE_TYPE,
        )
        log.info("WhisperSTT ready")

    def transcribe(self, pcm: bytes) -> str:
        """Blocking transcription. Returns the full text or ''.

        Args:
            pcm: raw little-endian int16 PCM samples (16 kHz mono,
                per the module docstring).

        Returns:
            The joined transcript, or '' when the model is not loaded,
            the input is empty, decoding fails, or the result is shorter
            than MIN_CHARS.
        """
        if self._model is None:
            log.warning("WhisperSTT.transcribe called before start()")
            return ""
        if not pcm:
            return ""
        # int16 -> float32 in [-1.0, 1.0), the format faster-whisper expects.
        audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
        if audio.size == 0:
            return ""
        try:
            # `info` was unused; keep the tuple unpack but mark it ignored.
            segments, _info = self._model.transcribe(
                audio,
                beam_size=BEAM_SIZE,
                language=LANGUAGE,
                vad_filter=VAD_FILTER,
                no_speech_threshold=NO_SPEECH_THRESHOLD,
                temperature=TEMPERATURE,
            )
            # `segments` is lazy; joining here consumes it fully so any
            # decode error surfaces inside this try block.
            text = " ".join(seg.text.strip() for seg in segments).strip()
        except Exception as exc:
            # Best-effort: a failed decode must not crash the pipeline.
            log.warning("Whisper transcribe failed: %s", exc)
            return ""
        if len(text) < MIN_CHARS:
            # Filter near-empty decodes to avoid firing phantom triggers.
            log.debug("drop short transcript: %r", text)
            return ""
        return text

    def stop(self) -> None:
        """Release the model reference so its memory can be freed."""
        self._model = None