"""LocalBrain — fully on-device voice pipeline. Implements the same contract as `gemini/script.py:GeminiBrain` so `voice/sanad_voice.py` can swap it in via `SANAD_VOICE_BRAIN=local`. Wires together four subsystems: Phase 1 — Silero VAD (mic → speech boundaries) Phase 2 — faster-whisper (speech → text) Phase 3 — llama.cpp + Qwen (text → streaming text chunks) Phase 4 — CosyVoice2 streaming (text chunk → cloned-voice audio) Phase 5 — barge-in (user speaks → cancel LLM + stop speaker) Phase 6 — stability — model load fails cleanly, crashes are logged. Async structure: run() is the main coroutine. It spawns three tasks: _mic_task — reads mic, VAD, Whisper, pushes user text to _llm_queue _dialogue_task — pops user text, streams LLM tokens into _tts_queue _tts_task — pops text chunks, synthesises, feeds the speaker Logging contract (matched by local/subprocess.py._track_line): "connecting to local pipeline" "listening" "USER: " "BOT: " "BARGE-IN (local)" "session error: " """ from __future__ import annotations import asyncio import time from typing import Optional from Project.Sanad.core.config_loader import section as _cfg_section from Project.Sanad.core.logger import get_logger from Project.Sanad.local.llm import LlamaServer from Project.Sanad.local.stt import WhisperSTT from Project.Sanad.local.tts import CosyVoiceTTS from Project.Sanad.local.vad import SileroVAD, FRAME_SAMPLES log = get_logger("local_brain") _CFG_SV = _cfg_section("voice", "sanad_voice") _CHUNK_BYTES = FRAME_SAMPLES * 2 # int16 mono class LocalBrain: """Fully on-device Gemini replacement.""" def __init__(self, audio_io, recorder, voice_name: Optional[str] = None, system_prompt: str = ""): self._audio = audio_io self._mic = audio_io.mic self._speaker = audio_io.speaker self._recorder = recorder self._voice = voice_name self._system_prompt = system_prompt # subsystems — instantiated here, loaded in run() self._vad = SileroVAD() self._stt = WhisperSTT() self._llm = LlamaServer() self._tts = CosyVoiceTTS() # pipeline queues self._llm_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=4) self._tts_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=4) # control flags self._stop_flag = asyncio.Event() # full shutdown self._interrupt = asyncio.Event() # per-turn barge-in self._speaking = False self._speak_start_time = 0.0 # ─── lifecycle ──────────────────────────────────────── def stop(self) -> None: self._stop_flag.set() self._interrupt.set() async def run(self) -> None: """Main entry. Loads models, runs pipeline, handles shutdown.""" log.info("connecting to local pipeline") try: await asyncio.to_thread(self._vad.start) await asyncio.to_thread(self._stt.start) await asyncio.to_thread(self._llm.start) await asyncio.to_thread(self._tts.start) except Exception as exc: log.error("session error: local pipeline startup failed — %s", exc) return log.info("listening") try: await asyncio.gather( self._mic_task(), self._dialogue_task(), self._tts_task(), ) except asyncio.CancelledError: log.info("cancelled — stopping") except Exception as exc: log.error("session error: %s", exc) finally: try: self._llm.stop() except Exception: log.warning("LlamaServer.stop failed", exc_info=True) self._tts.stop() self._stt.stop() self._vad.stop() log.info("local pipeline stopped") # ─── barge-in ───────────────────────────────────────── def _begin_barge_in(self) -> None: """Called from mic task when user starts speaking while bot is.""" if not self._speaking: return log.info("BARGE-IN (local)") self._interrupt.set() try: self._speaker.stop() except Exception: log.warning("speaker.stop during barge-in failed", exc_info=True) # drain pipelines — discard any pending LLM/TTS chunks for this turn self._drain_queue(self._llm_queue) self._drain_queue(self._tts_queue) self._speaking = False try: self._recorder.finish_turn() except Exception: pass @staticmethod def _drain_queue(q: asyncio.Queue) -> None: try: while True: q.get_nowait() q.task_done() except asyncio.QueueEmpty: pass # ─── Task 1: mic → VAD → Whisper → LLM queue ────────── async def _mic_task(self) -> None: loop = asyncio.get_event_loop() while not self._stop_flag.is_set(): try: pcm = await loop.run_in_executor( None, self._mic.read_chunk, _CHUNK_BYTES, ) except Exception: await asyncio.sleep(0.01) continue event = self._vad.process(pcm) if event == "speech_start": # user started talking — if bot is speaking, it's a barge-in if self._speaking: self._begin_barge_in() elif event == "speech_end": utt = self._vad.collected_audio() if not utt: continue try: self._recorder.capture_user(utt) except Exception: pass text = await loop.run_in_executor(None, self._stt.transcribe, utt) if not text: continue log.info("USER: %s", text) try: self._recorder.add_user_text(text) except Exception: pass # wake the LLM side — drop older pending item if full (latency > throughput) if self._llm_queue.full(): try: self._llm_queue.get_nowait() except asyncio.QueueEmpty: pass await self._llm_queue.put(text) # ─── Task 2: LLM streaming → TTS queue ──────────────── async def _dialogue_task(self) -> None: while not self._stop_flag.is_set(): try: user_text = await asyncio.wait_for( self._llm_queue.get(), timeout=0.2) except asyncio.TimeoutError: continue self._interrupt.clear() full_response = [] async for chunk in self._llm.stream( user_text, self._system_prompt, self._interrupt): if self._interrupt.is_set(): break full_response.append(chunk) await self._tts_queue.put(chunk) self._llm_queue.task_done() if full_response and not self._interrupt.is_set(): bot_text = " ".join(full_response).strip() if bot_text: log.info("BOT: %s", bot_text) try: self._recorder.add_robot_text(bot_text) except Exception: pass # ─── Task 3: TTS → speaker ──────────────────────────── async def _tts_task(self) -> None: loop = asyncio.get_event_loop() while not self._stop_flag.is_set(): try: chunk_text = await asyncio.wait_for( self._tts_queue.get(), timeout=0.2) except asyncio.TimeoutError: # idle — if we've been speaking and queue drained, close stream if self._speaking and self._llm_queue.empty() and self._tts_queue.empty(): await loop.run_in_executor(None, self._speaker.wait_finish) self._speaking = False log.info("listening") try: self._recorder.finish_turn() except Exception: pass continue if self._interrupt.is_set(): self._tts_queue.task_done() continue # synthesise this text chunk → stream to speaker if not self._speaking: await loop.run_in_executor(None, self._speaker.begin_stream) self._speaking = True self._speak_start_time = time.time() try: for pcm in self._tts.synthesize_stream(chunk_text): if self._interrupt.is_set(): break try: self._recorder.capture_robot(pcm) except Exception: pass await loop.run_in_executor( None, self._speaker.send_chunk, pcm, self._tts.output_rate, ) except Exception as exc: log.warning("TTS chunk failed: %s", exc) finally: self._tts_queue.task_done()