"""faster-whisper Large V3 Turbo — GPU INT8 transcription. Phase 2 of the local pipeline. Given an utterance (int16 PCM bytes at 16 kHz), returns transcribed text. Short / empty / no-speech results are filtered out per config thresholds to avoid firing phantom triggers. Install (on the robot, in the `local` env): pip install faster-whisper==1.0.* # model auto-downloads from HuggingFace on first `WhisperModel(...)` call, # OR pre-download to model/local/faster-whisper-large-v3-turbo/ and point # `local.stt.model_subdir` at it. """ from __future__ import annotations from typing import Optional import numpy as np from Project.Sanad.config import MODEL_DIR from Project.Sanad.core.config_loader import section as _cfg_section from Project.Sanad.core.logger import get_logger log = get_logger("local_stt") _CFG = _cfg_section("local", "stt") MODEL_NAME = _CFG.get("model_name", "large-v3-turbo") MODEL_SUBDIR = _CFG.get("model_subdir", "faster-whisper-large-v3-turbo") DEVICE = _CFG.get("device", "cuda") COMPUTE_TYPE = _CFG.get("compute_type", "int8_float16") BEAM_SIZE = _CFG.get("beam_size", 1) LANGUAGE = _CFG.get("language") # None = auto-detect VAD_FILTER = _CFG.get("vad_filter", False) NO_SPEECH_THRESHOLD = _CFG.get("no_speech_threshold", 0.6) MIN_CHARS = _CFG.get("min_utterance_chars", 2) TEMPERATURE = _CFG.get("temperature", 0.0) LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR class WhisperSTT: """Thin wrapper around faster_whisper.WhisperModel.""" def __init__(self) -> None: self._model = None def start(self) -> None: """Load the model into VRAM. ~4 s on first call, 100 ms after.""" try: from faster_whisper import WhisperModel except ImportError as exc: raise RuntimeError( f"WhisperSTT requires 'faster-whisper': {exc}" ) model_src = str(LOCAL_MODEL_DIR) if LOCAL_MODEL_DIR.exists() else MODEL_NAME log.info("loading Whisper: src=%s device=%s compute=%s", model_src, DEVICE, COMPUTE_TYPE) self._model = WhisperModel( model_src, device=DEVICE, compute_type=COMPUTE_TYPE, ) log.info("WhisperSTT ready") def transcribe(self, pcm: bytes) -> str: """Blocking transcription. Returns the full text or ''.""" if self._model is None: log.warning("WhisperSTT.transcribe called before start()") return "" if not pcm: return "" audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0 if audio.size == 0: return "" try: segments, info = self._model.transcribe( audio, beam_size=BEAM_SIZE, language=LANGUAGE, vad_filter=VAD_FILTER, no_speech_threshold=NO_SPEECH_THRESHOLD, temperature=TEMPERATURE, ) text = " ".join(seg.text.strip() for seg in segments).strip() except Exception as exc: log.warning("Whisper transcribe failed: %s", exc) return "" if len(text) < MIN_CHARS: log.debug("drop short transcript: %r", text) return "" return text def stop(self) -> None: self._model = None