"""faster-whisper Large V3 Turbo — GPU INT8 transcription.

Phase 2 of the local pipeline. Given an utterance (int16 PCM bytes at
16 kHz), returns transcribed text. Short / empty / no-speech results are
filtered out per config thresholds to avoid firing phantom triggers.

Install (on the robot, in the `local` env):

    pip install faster-whisper==1.0.*

    # model auto-downloads from HuggingFace on first `WhisperModel(...)` call,
    # OR pre-download to model/local/faster-whisper-large-v3-turbo/ and point
    # `local.stt.model_subdir` at it.
"""
from __future__ import annotations

from typing import Optional

import numpy as np

from Project.Sanad.config import MODEL_DIR
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger

log = get_logger("local_stt")

# All tunables come from the `local.stt` section of the project config.
_CFG = _cfg_section("local", "stt")

# HuggingFace model name, used when no pre-downloaded copy exists on disk.
MODEL_NAME = _CFG.get("model_name", "large-v3-turbo")
# Subdirectory under MODEL_DIR/local/ checked for a pre-downloaded model.
MODEL_SUBDIR = _CFG.get("model_subdir", "faster-whisper-large-v3-turbo")
# Device / quantization, forwarded verbatim to WhisperModel(...).
DEVICE = _CFG.get("device", "cuda")
COMPUTE_TYPE = _CFG.get("compute_type", "int8_float16")
# Decoding options, forwarded verbatim to WhisperModel.transcribe(...).
BEAM_SIZE = _CFG.get("beam_size", 1)
LANGUAGE = _CFG.get("language")  # None = auto-detect
VAD_FILTER = _CFG.get("vad_filter", False)
NO_SPEECH_THRESHOLD = _CFG.get("no_speech_threshold", 0.6)
TEMPERATURE = _CFG.get("temperature", 0.0)
# Transcripts shorter than this many characters are dropped as noise.
MIN_CHARS = _CFG.get("min_utterance_chars", 2)

# Preferred on-disk model location; used instead of MODEL_NAME when it exists.
LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR
class WhisperSTT:
    """Thin wrapper around faster_whisper.WhisperModel.

    Lifecycle: ``start()`` loads the model, ``transcribe()`` may then be
    called repeatedly, ``stop()`` drops the reference so VRAM can be freed.
    """

    def __init__(self) -> None:
        # Loaded lazily by start(); None means "not ready yet".
        self._model: "Optional[WhisperModel]" = None

    def start(self) -> None:
        """Load the model into VRAM. ~4 s on first call, 100 ms after.

        Prefers the pre-downloaded LOCAL_MODEL_DIR when present; otherwise
        passes MODEL_NAME to faster-whisper (auto-download from HuggingFace).

        Raises:
            RuntimeError: if the 'faster-whisper' package is not installed.
        """
        try:
            from faster_whisper import WhisperModel
        except ImportError as exc:
            # Chain the ImportError so the original failure stays visible.
            raise RuntimeError(
                f"WhisperSTT requires 'faster-whisper': {exc}"
            ) from exc

        model_src = str(LOCAL_MODEL_DIR) if LOCAL_MODEL_DIR.exists() else MODEL_NAME
        log.info("loading Whisper: src=%s device=%s compute=%s",
                 model_src, DEVICE, COMPUTE_TYPE)
        self._model = WhisperModel(
            model_src,
            device=DEVICE,
            compute_type=COMPUTE_TYPE,
        )
        log.info("WhisperSTT ready")

    def transcribe(self, pcm: bytes) -> str:
        """Blocking transcription. Returns the full text or ''.

        Args:
            pcm: int16 little-endian PCM bytes at 16 kHz.

        Returns:
            Transcribed text, or '' when the model is not started, the
            input is empty, transcription fails, or the result is shorter
            than MIN_CHARS.
        """
        if self._model is None:
            log.warning("WhisperSTT.transcribe called before start()")
            return ""
        if not pcm:
            return ""
        # int16 frames are 2 bytes; a stray odd trailing byte would make
        # np.frombuffer raise ValueError (outside the try below), so trim it.
        usable = len(pcm) - (len(pcm) % 2)
        if usable == 0:
            return ""
        audio = (
            np.frombuffer(pcm[:usable], dtype=np.int16).astype(np.float32)
            / 32768.0  # normalize to [-1, 1), the format faster-whisper expects
        )
        try:
            segments, _ = self._model.transcribe(
                audio,
                beam_size=BEAM_SIZE,
                language=LANGUAGE,
                vad_filter=VAD_FILTER,
                no_speech_threshold=NO_SPEECH_THRESHOLD,
                temperature=TEMPERATURE,
            )
            text = " ".join(seg.text.strip() for seg in segments).strip()
        except Exception as exc:
            # Best-effort: a failed transcription must not crash the pipeline.
            log.warning("Whisper transcribe failed: %s", exc)
            return ""

        if len(text) < MIN_CHARS:
            # Sub-threshold transcripts are treated as noise — do not fire.
            log.debug("drop short transcript: %r", text)
            return ""
        return text

    def stop(self) -> None:
        """Release the model reference so GC / CUDA can reclaim VRAM."""
        self._model = None