"""Template brain — copy this file to plug in a non-Gemini model.

How to use:
    1. Copy this file:
           `cp voice/model_script.py voice/openai_script.py`
    2. Rename the class: `ModelBrain` → e.g. `OpenAIRealtimeBrain`
    3. Fill in every block marked `TODO` with your provider's SDK calls.
    4. Register the new brain in `voice/sanad_voice.py` inside
       `_build_brain()` (there's a single `elif` to add).
    5. Run with
           `SANAD_VOICE_BRAIN=openai python3 voice/sanad_voice.py eth0`.

Contract that `sanad_voice.py` expects of ANY brain:

    __init__(audio_io, recorder, voice_name, system_prompt)
        audio_io      — voice.audio_io.AudioIO (exposes .mic + .speaker)
        recorder      — voice.sanad_voice.TurnRecorder (per-turn WAV capture)
        voice_name    — provider-specific voice id (e.g. "Charon", "alloy")
        system_prompt — persona string to seed the session with

    async run()
        Blocks until stopped or fatal. Reconnects are YOUR responsibility;
        the orchestrator won't restart you.

    stop()
        Sync signal (can be called from a signal handler). Set an
        asyncio.Event and let `run()` notice it.

What the mic side looks like:

    data = self._mic.read_chunk(n_bytes)   # 16 kHz int16 mono bytes
    # send `data` to your model's realtime-audio endpoint

What the speaker side looks like:

    self._speaker.begin_stream()
    self._speaker.send_chunk(pcm, source_rate=24000)   # rate is yours
    self._speaker.wait_finish()    # blocks until playback drains
    # or
    self._speaker.stop()           # cancel mid-playback (barge-in)

What the recorder side looks like:

    self._recorder.capture_user(pcm_bytes)    # mic audio for this turn
    self._recorder.capture_robot(pcm_bytes)   # model audio for this turn
    self._recorder.add_user_text(str)         # partial transcript
    self._recorder.add_robot_text(str)        # partial transcript
    self._recorder.finish_turn()              # flush to WAV + index.json
"""

from __future__ import annotations

import asyncio
from typing import Any, Optional

from Project.Sanad.core.logger import get_logger

log = get_logger("model_brain")


class ModelBrain:
    """Skeleton voice brain — adapt to your provider.

    The class wires the mic, speaker and recorder handles together and
    provides a reconnecting `run()` loop around two cooperating coroutines:
    `_send_mic_loop()` (mic → model) and `_receive_loop()` (model → speaker).
    Both are placeholders until the `TODO` blocks are filled in.
    """

    def __init__(self, audio_io, recorder,
                 voice_name: Optional[str] = None,
                 system_prompt: str = ""):
        """Store the I/O handles and session settings; open no connection.

        Args:
            audio_io: Object exposing `.mic` and `.speaker`.
            recorder: Per-turn recorder (see the module docstring).
            voice_name: Provider-specific voice id, or None.
            system_prompt: Persona string to seed the session with.
        """
        self._audio = audio_io
        self._mic = audio_io.mic
        self._speaker = audio_io.speaker
        self._recorder = recorder
        self._voice = voice_name
        self._system_prompt = system_prompt
        # Only `set()` / `is_set()` are used on this event (it is polled,
        # never awaited), so it is never waited on from inside the loop.
        self._stop_flag = asyncio.Event()

        # TODO: instantiate your provider's client here. Keep the client
        # creation cheap — connection/handshake should happen inside `run()`
        # so reconnects don't require re-building this object.
        # Example (needs `import os` at the top of the file):
        #   from openai import AsyncOpenAI
        #   self._client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
        self._client: Any = None

    # ─── lifecycle ────────────────────────────────────────

    def stop(self) -> None:
        """Signal the run loop to exit cleanly. Safe to call from anywhere."""
        self._stop_flag.set()

    async def run(self) -> None:
        """Main conversation loop. Blocks until stopped.

        Responsibilities:
          - Open a realtime session with your provider.
          - Forward mic audio to the model in small chunks.
          - Stream the model's audio response to the speaker.
          - Drive barge-in: when the user speaks while the model is
            speaking, cancel model playback and mark the turn interrupted.
          - On disconnect/error, back off and reconnect.

        Any exception raised by a session is logged and followed by a 2 s
        back-off before the next attempt. Cancellation of this coroutine
        ends the loop without re-raising.
        """
        while not self._stop_flag.is_set():
            try:
                log.info("connecting to model...")
                await self._run_session()
            except asyncio.CancelledError:
                break
            except Exception as exc:
                log.error("session error: %s — reconnecting in 2s", exc)
                # Poll the stop flag while backing off so `stop()` is not
                # delayed by the full reconnect interval.
                await self._sleep_unless_stopped(2)

    async def _run_session(self) -> None:
        """Run one session: the mic and receive loops, side by side.

        `asyncio.gather` alone leaves the surviving coroutine running when
        its sibling raises, which would stack up orphaned loops across
        reconnects. The loops are therefore run as explicit tasks that are
        always cancelled and drained before this method returns or raises.
        """
        # TODO: open a session with your provider. For websocket-style
        # APIs, use `async with client.realtime.connect(...) as session:`.
        # For request/response APIs, poll or stream in a loop.
        workers = [
            asyncio.create_task(self._send_mic_loop()),
            asyncio.create_task(self._receive_loop()),
        ]
        try:
            await asyncio.gather(*workers)
        finally:
            for task in workers:
                task.cancel()  # no-op for tasks that already finished
            # Wait for the cancellations to land; outcomes are discarded
            # because the primary error (if any) is already propagating.
            await asyncio.gather(*workers, return_exceptions=True)

    async def _sleep_unless_stopped(self, seconds: float,
                                    step: float = 0.1) -> None:
        """Sleep up to `seconds`, returning early once `stop()` was called.

        Args:
            seconds: Maximum time to sleep.
            step: Polling interval for the stop flag.
        """
        loop = asyncio.get_running_loop()
        deadline = loop.time() + seconds
        while not self._stop_flag.is_set():
            remaining = deadline - loop.time()
            if remaining <= 0:
                break
            await asyncio.sleep(min(step, remaining))

    # ─── mic → model ──────────────────────────────────────

    async def _send_mic_loop(self) -> None:
        """Read mic chunks and forward them to the model.

        Minimum responsibilities:
          - Loop on `self._mic.read_chunk(N_BYTES)`.
          - Encode to whatever format your provider expects (PCM16 mono is
            standard; some want base64 in JSON frames).
          - Respect `self._stop_flag`.

        Optional (highly recommended):
          - Measure energy; feed the mic frame to
            `self._recorder.capture_user` only when the user is actually
            speaking.
          - Apply echo suppression while the speaker is playing (mute or
            substitute silence when energy is low — keeps the model from
            transcribing its own voice bleed).

        Raises:
            Exception: Whatever `read_chunk` raised, unless `stop()` was
                already requested. Propagating (instead of silently
                leaving the loop) lets `run()` log the failure and
                reconnect rather than hang on a dead mic.
        """
        chunk_bytes = 1024  # 32 ms at 16 kHz mono int16 — tune to your API
        loop = asyncio.get_running_loop()
        while not self._stop_flag.is_set():
            try:
                # `read_chunk` is a sync call; run it on the default
                # executor so it cannot block the event loop.
                data = await loop.run_in_executor(
                    None, self._mic.read_chunk, chunk_bytes,
                )
            except Exception:
                if self._stop_flag.is_set():
                    # A read failing during shutdown is expected noise.
                    return
                raise

            # TODO: forward `data` to the model. Example for a hypothetical
            # websocket session:
            #   await session.send({"type": "audio", "pcm16": data})
            _ = data

            # Pace to real-time so we don't starve the event loop.
            # NOTE(review): if `read_chunk` already blocks for a full chunk
            # of audio, this extra sleep may make the loop fall behind
            # real time — confirm against the mic implementation.
            await asyncio.sleep(chunk_bytes / (16000 * 2))

    # ─── model → speaker ──────────────────────────────────

    async def _receive_loop(self) -> None:
        """Receive model events (audio chunks, transcripts, turn markers).

        Event handling you need to implement:
          - Audio chunk → `self._speaker.send_chunk(pcm, source_rate)`
            (first chunk must be preceded by
            `self._speaker.begin_stream()`).
          - Model interrupted → `self._speaker.stop(); self._mic.flush()`
            and call `self._recorder.finish_turn()`.
          - User transcript → `self._recorder.add_user_text(text)`.
          - Model transcript → `self._recorder.add_robot_text(text)`.
          - Turn complete → `self._speaker.wait_finish();
            self._recorder.finish_turn(); self._mic.flush()`.

        As shipped, this placeholder only idles in 0.1 s steps until the
        stop flag is set.
        """
        while not self._stop_flag.is_set():
            # TODO: iterate your provider's event stream and dispatch.
            await asyncio.sleep(0.1)