Update 2026-04-20 17:59:46

kassam 2026-04-20 17:59:47 +04:00
parent 71c45027f5
commit 94e4a9c4cb
26 changed files with 2186 additions and 790 deletions

config/gemini_config.json  (new file, +34 lines)

@@ -0,0 +1,34 @@
{
"_description": "Tunables for gemini/* modules. Loaded via core.config_loader.load('gemini'). API credentials (api_key, model, voice_name) still live in core_config.json > gemini_defaults — single source of truth shared with config.py.",
"client": {
"_comment": "gemini/client.py — short-session WebSocket client used by dashboard /generate + typed replay. default_system_prompt comes from core.gemini_defaults.",
"recv_timeout_sec": 30,
"reconnect_max_attempts": 3,
"reconnect_initial_delay_sec": 1.0,
"reconnect_max_delay_sec": 10.0
},
"subprocess": {
"_comment": "gemini/subprocess.py — GeminiSubprocess supervisor. Spawns voice/sanad_voice.py as a child, tails stdout for Gemini-specific log markers, exposes transcript + state to the dashboard.",
"log_tail_size": 2000,
"transcript_tail_size": 30,
"log_name": "gemini_subprocess",
"stop_timeout_sec": 3.0,
"terminate_timeout_sec": 2.0,
"noisy_prefixes": [
"ALSA lib ",
"Expression 'alsa_",
"Cannot connect to server socket",
"jack server is not running"
],
"noisy_fragments": [
"Unknown PCM",
"Evaluate error",
"snd_pcm_open_noupdate",
"PaAlsaStream",
"snd_config_evaluate",
"snd_func_refer"
]
}
}
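
For orientation, a minimal sketch of how a gemini/* module is expected to read these tunables; the `section` helper and the code-side fallback defaults mirror the call sites later in this commit, and the exact loader behaviour beyond that is an assumption:

# Sketch: mirrors the _cfg_section(...) call sites in gemini/client.py below.
from Project.Sanad.core.config_loader import section as _cfg_section

_CLIENT = _cfg_section("gemini", "client")          # the "client" block above
RECV_TIMEOUT = _CLIENT.get("recv_timeout_sec", 30)  # fallback mirrors the JSON value
MAX_ATTEMPTS = _CLIENT.get("reconnect_max_attempts", 3)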

config/local_config.json  (new file, +92 lines)

@@ -0,0 +1,92 @@
{
"_description": "Tunables for local/* — fully on-device voice pipeline (Silero VAD → Whisper → Qwen via llama.cpp → CosyVoice2). Loaded via core.config_loader.load('local').",
"subprocess": {
"_comment": "local/subprocess.py — LocalSubprocess supervisor. Mirrors gemini/subprocess.py. IMPORTANT: python_bin points at the `local` conda env (Python 3.8 + Jetson CUDA torch) so CosyVoice+Whisper run with GPU, while the dashboard/Gemini stack stays in gemini_sdk (Python 3.10).",
"python_bin": "/home/unitree/miniconda3/envs/local/bin/python",
"log_tail_size": 2000,
"transcript_tail_size": 30,
"log_name": "local_subprocess",
"stop_timeout_sec": 5.0,
"terminate_timeout_sec": 3.0,
"noisy_prefixes": [
"ALSA lib ",
"Expression 'alsa_",
"Cannot connect to server socket",
"jack server is not running"
],
"noisy_fragments": [
"Unknown PCM",
"Evaluate error",
"snd_pcm_open_noupdate",
"PaAlsaStream"
]
},
"vad": {
"_comment": "Silero VAD — CPU. Emits speech_start / speech_end events.",
"sample_rate": 16000,
"frame_ms": 32,
"threshold": 0.55,
"min_silence_ms": 400,
"min_speech_ms": 250,
"pad_start_ms": 200,
"pad_end_ms": 200,
"device": "cpu"
},
"stt": {
"_comment": "faster-whisper Large V3 Turbo, INT8 on GPU.",
"model_name": "large-v3-turbo",
"model_subdir": "faster-whisper-large-v3-turbo",
"device": "cuda",
"compute_type": "int8_float16",
"beam_size": 1,
"language": null,
"vad_filter": false,
"no_speech_threshold": 0.6,
"min_utterance_chars": 2,
"temperature": 0.0
},
"llm": {
"_comment": "Qwen 2.5 Instruct via Ollama (default) OR self-managed llama.cpp. Set backend to pick.",
"backend": "ollama",
"_ollama_comment": "Ollama daemon — assumes `ollama serve` is running; `ollama pull qwen2.5:1.5b` to fetch.",
"ollama_host": "127.0.0.1",
"ollama_port": 11434,
"ollama_model": "qwen2.5:1.5b",
"ollama_keep_alive": "5m",
"_llamacpp_comment": "Self-managed llama-server subprocess. Only used when backend='llama_cpp'.",
"model_subdir": "qwen2.5-1.5b-instruct-q4_k_m.gguf",
"server_binary": "llama-server",
"host": "127.0.0.1",
"port": 8080,
"n_gpu_layers": 99,
"ctx_size": 2048,
"threads": 4,
"startup_timeout_sec": 30,
"_shared_comment": "Generation params — both backends.",
"request_timeout_sec": 30,
"max_tokens": 200,
"temperature": 0.7,
"top_p": 0.9,
"stop": ["<|im_end|>", "\n\n\n"],
"chunk_delimiters": ".,?!؟،",
"chunk_min_chars": 8
},
"tts": {
"_comment": "CosyVoice2 0.5B streaming — GPU. Uses a 3s reference WAV for voice cloning.",
"model_subdir": "CosyVoice2-0.5B",
"reference_wav_subdir": "khaleeji_reference_3s.wav",
"reference_prompt": "",
"stream_chunk_sec": 0.25,
"sample_rate": 16000,
"queue_max": 3,
"device": "cuda"
}
}


@@ -50,39 +50,12 @@
     "dir_relative": "data/recordings"
   },
-  "system_prompt": {
-    "_comment": "Persona filename lives in core.script_files.persona; default text in core.gemini_defaults.default_system_prompt. This section is now metadata-only."
-  },
   "typed_replay": {
     "_comment": "voice/typed_replay.py — max_text_len comes from dashboard.api_input",
     "monitor_chunk_size": 512,
     "monitor_tail_sec": 0.2
   },
-  "live_gemini_subprocess": {
-    "_comment": "voice/live_gemini_subprocess.py — LiveGeminiSubprocess",
-    "log_tail_size": 2000,
-    "transcript_tail_size": 30,
-    "log_name": "live_gemini_subprocess",
-    "stop_timeout_sec": 3.0,
-    "terminate_timeout_sec": 2.0,
-    "noisy_prefixes": [
-      "ALSA lib ",
-      "Expression 'alsa_",
-      "Cannot connect to server socket",
-      "jack server is not running"
-    ],
-    "noisy_fragments": [
-      "Unknown PCM",
-      "Evaluate error",
-      "snd_pcm_open_noupdate",
-      "PaAlsaStream",
-      "snd_config_evaluate",
-      "snd_func_refer"
-    ]
-  },
   "live_voice_loop": {
     "_comment": "voice/live_voice_loop.py — arm phrase dispatcher. arm_txt filename comes from core.script_files.arm_phrases",
     "trigger_log_size": 100,
@@ -97,27 +70,5 @@
     "xvector_filename": "arabic_xvector_embedding.pt",
     "sample_rate": 16000,
     "channels": 1
-  },
-  "gemini_client": {
-    "_comment": "voice/gemini_client.py — default_system_prompt comes from core.gemini_defaults",
-    "recv_timeout_sec": 30,
-    "reconnect_max_attempts": 3,
-    "reconnect_initial_delay_sec": 1.0,
-    "reconnect_max_delay_sec": 10.0
-  },
-  "asr_buffer": {
-    "_comment": "text_utils.maybe_trigger_arm state machine defaults",
-    "window_sec": 2.0,
-    "short_token_bonus_sec": 1.0,
-    "join_no_space_maxlen": 2,
-    "max_chars": 120,
-    "stream_max_chars": 80,
-    "trigger_dedup_window_sec": 2.0,
-    "pending_arm_ttl_sec": 6.0,
-    "pending_arm_fallback_sec": 0.65,
-    "dup_call_window_sec": 0.25,
-    "dup_asr_repeat_window_sec": 0.9
   }
 }


@@ -46,7 +46,7 @@ class Brain:
         self._lock = asyncio.Lock()
         # Sub-modules are injected after construction so imports stay lazy.
-        self._voice = None      # voice.gemini_client.GeminiVoiceClient
+        self._voice = None      # gemini.client.GeminiVoiceClient
         self._audio_mgr = None  # voice.audio_manager.AudioManager
         self._arm = None        # motion.arm_controller.ArmController
         self._macro_rec = None  # motion.macro_recorder.MacroRecorder


@@ -1,6 +1,6 @@
 """Live Voice Commands — voice-to-arm phrase trigger dispatcher.
-Listens to LiveGeminiSubprocess user transcripts, matches against
+Listens to GeminiSubprocess user transcripts, matches against
 sanad_arm.txt phrases, and fires ARM.trigger_action_by_id.
 Endpoints:


@@ -193,7 +193,7 @@ async def update_api_key(payload: ApiKeyPayload):
         raise HTTPException(500, f"Could not save config: {exc}")
     # Hot-swap the in-memory module globals.
-    # Both Project.Sanad.config AND Project.Sanad.voice.gemini_client
+    # Both Project.Sanad.config AND Project.Sanad.gemini.client
     # have their OWN reference to GEMINI_API_KEY (the latter was created
     # at `from Project.Sanad.config import GEMINI_API_KEY` at import time).
     # Python's `from X import Y` binds a local name — updating config.Y
@@ -205,10 +205,10 @@ async def update_api_key(payload: ApiKeyPayload):
         log.exception("could not patch config.GEMINI_API_KEY")
     try:
-        import Project.Sanad.voice.gemini_client as _gc
+        import Project.Sanad.gemini.client as _gc
         _gc.GEMINI_API_KEY = key
     except Exception:
-        log.exception("could not patch gemini_client.GEMINI_API_KEY")
+        log.exception("could not patch gemini.client.GEMINI_API_KEY")
     # Disconnect any live session so reconnect uses the new key.
     from Project.Sanad.main import voice_client


@@ -8,7 +8,7 @@ Usage:
     python3 voice_example.py gemini "hello"        # one-shot Gemini text→audio
     python3 voice_example.py local_tts "hello"     # local Coqui TTS
     python3 voice_example.py typed_replay "hello"  # typed replay engine
-    python3 voice_example.py live                  # spawn LiveGeminiSubprocess
+    python3 voice_example.py live                  # spawn GeminiSubprocess
     python3 voice_example.py status                # show status of all subsystems
 Assumes Project.Sanad is importable (run from repo root or with PYTHONPATH set).
@@ -23,7 +23,7 @@ import sys
 def _demo_gemini(text: str) -> None:
     """One-shot: connect Gemini, send text, play reply."""
-    from Project.Sanad.voice.gemini_client import GeminiVoiceClient
+    from Project.Sanad.gemini.client import GeminiVoiceClient
     from Project.Sanad.voice.audio_manager import AudioManager
     async def run():
@@ -55,7 +55,7 @@ def _demo_local_tts(text: str) -> None:
 def _demo_typed_replay(text: str) -> None:
     """Exercise the TypedReplayEngine end-to-end."""
-    from Project.Sanad.voice.gemini_client import GeminiVoiceClient
+    from Project.Sanad.gemini.client import GeminiVoiceClient
     from Project.Sanad.voice.audio_manager import AudioManager
     from Project.Sanad.voice.typed_replay import TypedReplayEngine
@@ -73,9 +73,9 @@ def _demo_typed_replay(text: str) -> None:
 def _demo_live() -> None:
     """Spawn the live voice subprocess — same as dashboard /api/live-subprocess."""
-    from Project.Sanad.voice.live_gemini_subprocess import LiveGeminiSubprocess
-    mgr = LiveGeminiSubprocess()
+    from Project.Sanad.gemini.subprocess import GeminiSubprocess
+    mgr = GeminiSubprocess()
     info = mgr.start()
     print(f"[live] {info}")
     print("Running. Ctrl+C to stop.")
@@ -90,7 +90,7 @@ def _demo_live() -> None:
 def _demo_status() -> None:
     """Print status of all voice subsystems."""
-    from Project.Sanad.voice.gemini_client import GeminiVoiceClient
+    from Project.Sanad.gemini.client import GeminiVoiceClient
     try:
         from Project.Sanad.voice.local_tts import LocalTTSEngine
     except Exception:

gemini/__init__.py  (new file, empty)


@@ -30,7 +30,7 @@ from Project.Sanad.core.logger import get_logger
 log = get_logger("gemini_client")
-_GC = _cfg_section("voice", "gemini_client")
+_GC = _cfg_section("gemini", "client")
 # Default system prompt — SINGLE SOURCE in core.gemini_defaults
 _DEFAULT_SYSTEM_PROMPT = _cfg_section("core", "gemini_defaults").get(
     "default_system_prompt",

gemini/script.py  (new file, +370 lines)

@@ -0,0 +1,370 @@
"""Gemini brain — live conversation loop using the google-genai SDK.
Implements the VoiceBrain contract documented in `voice/model_script.py`:
__init__(audio_io, recorder, voice_name, system_prompt)
async run()
stop()
Owns everything Gemini-specific: the `genai.Client`, `LiveConnectConfig`,
the session connect/receive loop, VAD-based barge-in, echo suppression,
reconnect backoff. Hardware I/O is delegated to `audio_io` and per-turn
WAV capture to `recorder`; both are model-agnostic.
Env overrides:
    SANAD_GEMINI_MODEL   Gemini Live model id (without "models/" prefix)
"""
from __future__ import annotations
import array
import asyncio
import os
import time
from typing import Any, Optional
import numpy as np
from google import genai
from google.genai import types
from Project.Sanad.config import (
CHUNK_SIZE,
GEMINI_API_KEY,
GEMINI_VOICE,
RECEIVE_SAMPLE_RATE,
SEND_SAMPLE_RATE,
)
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
log = get_logger("gemini_brain")
_SV = _cfg_section("voice", "sanad_voice")
_VAD = _cfg_section("voice", "vad")
_BI = _cfg_section("voice", "barge_in")
_MODEL = os.environ.get(
"SANAD_GEMINI_MODEL",
"gemini-2.5-flash-native-audio-preview-12-2025",
)
_MIC_GAIN = _SV.get("mic_gain", 1.0)
_SESSION_TIMEOUT = _SV.get("session_timeout_sec", 660)
_MAX_RECONNECT_DELAY = _SV.get("max_reconnect_delay_sec", 30)
_MAX_CONSECUTIVE_ERRORS = _SV.get("max_consecutive_errors", 10)
_NO_MESSAGES_TIMEOUT = _SV.get("no_messages_timeout_sec", 30)
_CHUNK_BYTES = CHUNK_SIZE * 2
_SILENCE_PCM = b"\x00" * _CHUNK_BYTES
def _audio_energy(pcm: bytes) -> int:
try:
samples = array.array("h", pcm)
return sum(abs(s) for s in samples) // len(samples) if samples else 0
except Exception:
return 0
class GeminiBrain:
"""Gemini Live conversation brain — reconnect-safe."""
def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
system_prompt: str = ""):
self._audio = audio_io
self._mic = audio_io.mic
self._speaker = audio_io.speaker
self._recorder = recorder
self._voice = voice_name or GEMINI_VOICE
self._system_prompt = system_prompt
self._api_key = GEMINI_API_KEY
self._stop_flag = asyncio.Event()
# per-session state (reset in the outer reconnect loop)
self._speaking = False
self._stream_started = False
self._barge_block_until = 0.0
self._ai_speak_start = 0.0
self._last_ai_audio = 0.0
self._done: Optional[asyncio.Event] = None
def stop(self) -> None:
"""Signal the run loop to exit at the next opportunity."""
try:
self._stop_flag.set()
except Exception:
pass
# ─── public entry point ───────────────────────────────
async def run(self) -> None:
client = genai.Client(api_key=self._api_key)
config = self._build_config()
session_num = 0
start_time = time.time()
consecutive_errors = 0
while not self._stop_flag.is_set():
session_num += 1
self._reset_turn_state()
uptime_min = (time.time() - start_time) / 60
try:
log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
session_num, uptime_min)
async with client.aio.live.connect(model=_MODEL, config=config) as session:
log.info("connected — speak anytime!")
consecutive_errors = 0
self._mic.flush()
self._done = asyncio.Event()
try:
await asyncio.wait_for(
asyncio.gather(
self._send_mic_loop(session),
self._receive_loop(session),
),
timeout=_SESSION_TIMEOUT,
)
except asyncio.TimeoutError:
log.warning("session timed out after %ds", _SESSION_TIMEOUT)
except asyncio.CancelledError:
log.warning("session cancelled")
log.info("session #%d ended — reconnecting in 1s", session_num)
self._speaker.stop()
self._mic.flush()
await asyncio.sleep(1)
except asyncio.CancelledError:
log.info("cancelled — stopping")
break
except KeyboardInterrupt:
log.info("keyboard interrupt — stopping")
break
except Exception as exc:
consecutive_errors += 1
delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
log.error("session error (#%d): %s — reconnecting in %ds",
consecutive_errors, exc, delay)
await asyncio.sleep(delay)
if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
log.warning("%d consecutive errors — recreating client",
consecutive_errors)
try:
client = genai.Client(api_key=self._api_key)
consecutive_errors = 0
except Exception as ce:
log.error("client recreation failed: %s", ce)
# ─── Gemini config ────────────────────────────────────
def _build_config(self) -> types.LiveConnectConfig:
return types.LiveConnectConfig(
response_modalities=["AUDIO"],
speech_config=types.SpeechConfig(
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name=self._voice,
),
),
),
realtime_input_config=types.RealtimeInputConfig(
automatic_activity_detection=types.AutomaticActivityDetection(
disabled=False,
start_of_speech_sensitivity=getattr(
types.StartSensitivity,
_VAD.get("start_sensitivity", "START_SENSITIVITY_HIGH"),
),
end_of_speech_sensitivity=getattr(
types.EndSensitivity,
_VAD.get("end_sensitivity", "END_SENSITIVITY_LOW"),
),
prefix_padding_ms=_VAD.get("prefix_padding_ms", 20),
silence_duration_ms=_VAD.get("silence_duration_ms", 200),
),
),
input_audio_transcription=types.AudioTranscriptionConfig(),
output_audio_transcription=types.AudioTranscriptionConfig(),
system_instruction=types.Content(
parts=[types.Part(text=self._system_prompt)],
),
)
# ─── state helpers ────────────────────────────────────
def _reset_turn_state(self) -> None:
self._speaking = False
self._stream_started = False
self._barge_block_until = 0.0
self._ai_speak_start = 0.0
self._last_ai_audio = 0.0
def _interrupt(self, source: str = "local") -> None:
self._speaking = False
self._stream_started = False
self._speaker.stop()
self._mic.flush()
self._recorder.finish_turn()
log.info("interrupt (%s)", source)
# ─── mic send loop ────────────────────────────────────
async def _send_mic_loop(self, session: Any) -> None:
threshold = _BI.get("threshold", 500)
chunks_needed = _BI.get("loud_chunks_needed", 3)
cooldown = _BI.get("cooldown_sec", 0.3)
echo_suppress_below = _BI.get("echo_suppress_below", 500)
grace = _BI.get("ai_speak_grace_sec", 0.15)
loop = asyncio.get_event_loop()
loud_count = 0
last_activity = time.time()
while not self._done.is_set() and not self._stop_flag.is_set():
try:
raw = await loop.run_in_executor(
None, self._mic.read_chunk, _CHUNK_BYTES,
)
except Exception:
break
samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
data = samples.tobytes()
energy = _audio_energy(data)
now = time.time()
# Barge-in: after AI starts speaking, sustained user energy cuts it.
if self._speaking and now >= self._barge_block_until:
if (now - self._ai_speak_start) >= grace:
if energy > threshold:
loud_count += 1
else:
loud_count = max(0, loud_count - 1)
if loud_count > chunks_needed:
log.info("BARGE-IN (e=%d)", energy)
self._interrupt("barge-in")
loud_count = 0
self._barge_block_until = now + cooldown
# Echo suppression: while AI is speaking, mask quiet frames so the
# mic doesn't feed the model its own voice bleed.
send_data = data
if self._speaking and energy < echo_suppress_below:
send_data = _SILENCE_PCM
# Record user audio when clearly speaking and AI isn't.
if energy > 250 and not self._speaking:
self._recorder.capture_user(data)
# Keep-alive watchdog
if energy > 250:
last_activity = now
elif now - last_activity > 10:
log.info("alive (no speech %.0fs, e=%d)",
now - last_activity, energy)
last_activity = now
try:
await session.send_realtime_input(
audio=types.Blob(
data=send_data,
mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}",
),
)
except asyncio.CancelledError:
return
except Exception as exc:
log.warning("mic send failed: %s — ending session", exc)
self._done.set()
return
await asyncio.sleep(CHUNK_SIZE / SEND_SAMPLE_RATE)
log.info("send_mic task ended")
# ─── receive loop ─────────────────────────────────────
async def _receive_loop(self, session: Any) -> None:
loop = asyncio.get_event_loop()
try:
last_recv = time.time()
while not self._done.is_set() and not self._stop_flag.is_set():
async for response in session.receive():
last_recv = time.time()
if self._done.is_set():
break
if hasattr(response, "go_away") and response.go_away is not None:
log.info("server going away — will reconnect")
self._done.set()
return
sc = response.server_content
if sc is None:
continue
if sc.interrupted is True:
if self._speaking:
log.info("Gemini interrupted")
self._interrupt("gemini")
continue
if sc.input_transcription:
text = (sc.input_transcription.text or "").strip()
if text and not self._speaking:
log.info("USER: %s", text)
self._recorder.add_user_text(text)
if sc.output_transcription:
text = (sc.output_transcription.text or "").strip()
if text:
log.info("BOT : %s", text)
self._recorder.add_robot_text(text)
if sc.model_turn:
for part in sc.model_turn.parts:
if part.inline_data and part.inline_data.data:
now = time.time()
if not self._speaking:
self._ai_speak_start = now
self._speaking = True
self._last_ai_audio = now
raw_audio = part.inline_data.data
self._recorder.capture_robot(raw_audio)
audio = np.frombuffer(raw_audio, dtype=np.int16)
if not self._stream_started:
await loop.run_in_executor(
None, self._speaker.begin_stream,
)
self._stream_started = True
await loop.run_in_executor(
None, self._speaker.send_chunk,
audio, RECEIVE_SAMPLE_RATE,
)
if sc.turn_complete:
if (self._speaking and self._stream_started
and not self._speaker.interrupted):
log.info("speaker %.1fs", self._speaker.total_sent_sec)
await loop.run_in_executor(
None, self._speaker.wait_finish,
)
elif self._speaking and self._speaker.interrupted:
log.info("speaker interrupted")
self._speaking = False
self._stream_started = False
self._mic.flush()
self._recorder.finish_turn()
log.info("listening")
if time.time() - last_recv > _NO_MESSAGES_TIMEOUT:
log.warning("no messages from Gemini for %ds — session dead",
_NO_MESSAGES_TIMEOUT)
break
await asyncio.sleep(0.1)
except Exception as exc:
log.warning("receive ended: %s", exc)
finally:
self._done.set()
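
For readers new to the codebase, the VoiceBrain contract named in the docstring is small enough to sketch as a typing.Protocol. This shape is inferred from the docstring above and from LocalBrain; `voice/model_script.py` remains the authoritative definition.

# Sketch: inferred contract; see voice/model_script.py for the real one.
from typing import Optional, Protocol

class VoiceBrain(Protocol):
    def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
                 system_prompt: str = "") -> None: ...
    async def run(self) -> None: ...   # connect and converse until stop()
    def stop(self) -> None: ...        # ask run() to exit at the next opportunity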


@@ -1,7 +1,11 @@
-"""Live Gemini Subprocess Manager — start/stop sanad_voice.py as managed child.
-Mirrors gemini_voice_v2/LiveGeminiManager. Launches the voice script as a
-subprocess, tails stdout, parses state transitions and user transcripts.
+"""Gemini live subprocess supervisor.
+Spawns `voice/sanad_voice.py` as a managed child with `SANAD_VOICE_BRAIN=gemini`,
+tails the child's stdout, and extracts state transitions + user transcripts
+from the Gemini-specific log lines emitted by `gemini/script.py:GeminiBrain`.
+When a new model is added, build its own sibling supervisor (see
+`voice/model_subprocess.py` for the template); do not refactor this file.
 """
 from __future__ import annotations
@@ -22,9 +26,9 @@ from Project.Sanad.config import BASE_DIR, LOGS_DIR, SCRIPTS_DIR, LIVE_TUNE
 from Project.Sanad.core.config_loader import section as _cfg_section
 from Project.Sanad.core.logger import get_logger
-log = get_logger("live_subprocess")
-_LS_CFG = _cfg_section("voice", "live_gemini_subprocess")
+log = get_logger("gemini_subprocess")
+_LS_CFG = _cfg_section("gemini", "subprocess")
 def _resolve_live_script() -> Path:
@@ -51,7 +55,7 @@ TRANSCRIPT_TAIL_SIZE = _LS_CFG.get("transcript_tail_size", 30)
 # Persistent on-disk log for the full subprocess session.
 LIVE_LOG_DIR = LOGS_DIR
-LIVE_LOG_NAME = _LS_CFG.get("log_name", "live_gemini_subprocess")
+LIVE_LOG_NAME = _LS_CFG.get("log_name", "gemini_subprocess")
 _STOP_TIMEOUT_SEC = _LS_CFG.get("stop_timeout_sec", 3.0)
 _TERMINATE_TIMEOUT_SEC = _LS_CFG.get("terminate_timeout_sec", 2.0)
@@ -66,7 +70,7 @@ _NOISY_FRAGMENTS = tuple(_LS_CFG.get("noisy_fragments", [
 ]))
-class LiveGeminiSubprocess:
+class GeminiSubprocess:
     def __init__(self):
         self._lock = threading.Lock()
         self.process: subprocess.Popen | None = None
@@ -102,23 +106,33 @@ class LiveGeminiSubprocess:
         self.state_message = msg
     def _track_line(self, line: str):
-        if "Connecting to Gemini" in line:
+        """Parse Gemini-specific log markers emitted by `gemini/script.py`.
+        Must stay in lock-step with the `log.info(...)` strings in
+        `GeminiBrain`. If you add a new state, add the emit in the brain
+        AND the matching detector here in one PR.
+        """
+        if "connecting to Gemini" in line:
             self._set_state("connecting", line)
-        elif "Connected! Sanad is listening" in line:
+        elif "connected — speak anytime" in line or "connected - speak anytime" in line:
             self._set_state("listening", "Listening for speech.")
-        elif "USER SAID:" in line:
-            text = line.split("USER SAID:", 1)[1].strip()
+        elif " USER: " in line or line.strip().startswith("USER:"):
+            # GeminiBrain emits: log.info("USER: %s", text)
+            text = line.split("USER:", 1)[1].strip()
             if text:
                 self.last_user_text = text
                 self.user_transcript.append(text)
                 self._set_state("hearing", f"User: {text}")
-        elif "Interruption!" in line:
+        elif "BARGE-IN" in line or "Gemini interrupted" in line or "interrupt (" in line:
             self._set_state("interrupting", line)
-        elif any(k in line for k in ("Mic Error:", "Speaker Error:", "Fatal Error:")):
+        elif "listening" in line.lower() and "no speech" not in line:
+            # Fires on "listening" (post-turn) — keep the state fresh.
+            self._set_state("listening", "Listening for speech.")
+        elif "session error" in line or "client recreation failed" in line:
             self._set_state("error", line)
-        elif "WebSocket closed." in line:
+        elif "server going away" in line or "session ended" in line or "session dead" in line:
             self._set_state("warning", line)
-        elif "Ma'a Salama" in line:
+        elif "keyboard interrupt" in line or "cancelled — stopping" in line:
             self._set_state("stopped", line)
     def _reader_loop(self):
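
Since the docstring insists the detectors stay in lock-step with GeminiBrain's log strings, a small regression test keeps the pair honest. A sketch follows; the sample lines are copied from the `log.info` calls in `gemini/script.py`, while the test harness itself is an assumption:

# Sketch: feed representative GeminiBrain log lines through the detector.
from Project.Sanad.gemini.subprocess import GeminiSubprocess

def test_track_line_states():
    mgr = GeminiSubprocess()
    mgr._track_line("connecting to Gemini (session #1, uptime 0m)...")
    assert mgr.state == "connecting"
    mgr._track_line("connected — speak anytime!")
    assert mgr.state == "listening"
    mgr._track_line("USER: open the door")
    assert mgr.state == "hearing" and mgr.last_user_text == "open the door"
    mgr._track_line("BARGE-IN (e=812)")
    assert mgr.state == "interrupting"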

local/__init__.py  (new file, empty)

local/llm.py  (new file, +305 lines)

@@ -0,0 +1,305 @@
"""LLM layer — Qwen 2.5 Instruct via Ollama (default) or self-managed llama.cpp.
Phase 3 of the local pipeline. Two backends, selectable via
`config/local_config.json > llm.backend`:
"ollama" talk to a running `ollama serve` daemon (default).
No subprocess management, no CUDA build. Just:
ollama pull qwen2.5:1.5b
# daemon usually auto-starts; if not: `ollama serve &`
"llama_cpp" launch our own `llama-server` subprocess. Requires
a CUDA build of llama.cpp and a GGUF file at
`model/local/<llm.model_subdir>`.
Both backends stream tokens and chunk them on sentence delimiters so
the TTS can start synthesising before the LLM finishes.
"""
from __future__ import annotations
import asyncio
import json
import shutil
import subprocess
import time
from typing import AsyncIterator, Optional
from Project.Sanad.config import MODEL_DIR
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
log = get_logger("local_llm")
_CFG = _cfg_section("local", "llm")
BACKEND = (_CFG.get("backend") or "ollama").strip().lower()
# Ollama
OLLAMA_HOST = _CFG.get("ollama_host", "127.0.0.1")
OLLAMA_PORT = int(_CFG.get("ollama_port", 11434))
OLLAMA_MODEL = _CFG.get("ollama_model", "qwen2.5:1.5b")
OLLAMA_KEEP_ALIVE = _CFG.get("ollama_keep_alive", "5m")
# llama.cpp
MODEL_SUBDIR = _CFG.get("model_subdir", "qwen2.5-1.5b-instruct-q4_k_m.gguf")
SERVER_BIN = _CFG.get("server_binary", "llama-server")
HOST = _CFG.get("host", "127.0.0.1")
PORT = int(_CFG.get("port", 8080))
N_GPU_LAYERS = _CFG.get("n_gpu_layers", 99)
CTX_SIZE = _CFG.get("ctx_size", 2048)
THREADS = _CFG.get("threads", 4)
STARTUP_TIMEOUT = _CFG.get("startup_timeout_sec", 30)
# Shared generation params
REQUEST_TIMEOUT = _CFG.get("request_timeout_sec", 30)
MAX_TOKENS = _CFG.get("max_tokens", 200)
TEMPERATURE = _CFG.get("temperature", 0.7)
TOP_P = _CFG.get("top_p", 0.9)
STOP_SEQS = list(_CFG.get("stop", ["<|im_end|>"]))
CHUNK_DELIMS = _CFG.get("chunk_delimiters", ".,?!؟،")
CHUNK_MIN_CHARS = int(_CFG.get("chunk_min_chars", 8))
LOCAL_MODEL_PATH = MODEL_DIR / "local" / MODEL_SUBDIR
class LlamaServer:
"""Thin wrapper — owns subprocess (llama.cpp) or no-op (ollama)."""
def __init__(self) -> None:
self._proc: Optional[subprocess.Popen] = None
# ─── lifecycle ────────────────────────────────────────
def start(self) -> None:
if BACKEND == "ollama":
self._check_ollama()
log.info("LLM backend=ollama model=%s (@ %s:%d)",
OLLAMA_MODEL, OLLAMA_HOST, OLLAMA_PORT)
return
if BACKEND == "llama_cpp":
self._start_llama_cpp()
return
raise RuntimeError(f"unknown llm.backend: {BACKEND!r}")
def stop(self) -> None:
if self._proc is None:
return
try:
self._proc.terminate()
self._proc.wait(timeout=3)
except subprocess.TimeoutExpired:
self._proc.kill()
self._proc.wait(timeout=2)
except Exception as exc:
log.warning("llama-server stop error: %s", exc)
self._proc = None
def alive(self) -> bool:
if BACKEND == "ollama":
return self._ping_ollama()
return self._proc is not None and self._proc.poll() is None
# ─── Ollama backend ───────────────────────────────────
def _check_ollama(self) -> None:
"""Verify the Ollama daemon is running + the model is pulled."""
import urllib.request
tags_url = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/tags"
try:
with urllib.request.urlopen(tags_url, timeout=3) as r:
body = json.loads(r.read().decode("utf-8"))
except Exception as exc:
raise RuntimeError(
f"Ollama daemon not reachable at {tags_url} — is `ollama serve` running? ({exc})"
)
models = [m.get("name", "") for m in body.get("models", [])]
if not any(OLLAMA_MODEL in m for m in models):
raise RuntimeError(
f"Ollama model {OLLAMA_MODEL!r} not pulled. "
f"Run: `ollama pull {OLLAMA_MODEL}`. Available: {models}"
)
def _ping_ollama(self) -> bool:
import urllib.request
try:
with urllib.request.urlopen(
f"http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/tags", timeout=1,
) as r:
return r.status == 200
except Exception:
return False
async def _stream_ollama(self, user_text: str, system_prompt: str,
cancel: asyncio.Event) -> AsyncIterator[str]:
import aiohttp
url = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/generate"
payload = {
"model": OLLAMA_MODEL,
"system": system_prompt,
"prompt": user_text,
"stream": True,
"keep_alive": OLLAMA_KEEP_ALIVE,
"options": {
"num_predict": MAX_TOKENS,
"temperature": TEMPERATURE,
"top_p": TOP_P,
"stop": STOP_SEQS,
},
}
buf = ""
async with aiohttp.ClientSession() as sess:
try:
async with sess.post(
url, json=payload,
timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)) as resp:
async for raw in resp.content:
if cancel.is_set():
log.info("LLM stream cancelled (barge-in)")
return
line = raw.decode("utf-8", errors="ignore").strip()
if not line:
continue
try:
obj = json.loads(line)
except json.JSONDecodeError:
continue
token = obj.get("response", "")
if token:
buf += token
if len(buf) >= CHUNK_MIN_CHARS and buf[-1] in CHUNK_DELIMS:
yield buf.strip()
buf = ""
if obj.get("done"):
break
except asyncio.CancelledError:
return
except Exception as exc:
log.warning("Ollama stream error: %s", exc)
return
if buf.strip():
yield buf.strip()
# ─── llama.cpp backend ────────────────────────────────
def _start_llama_cpp(self) -> None:
if self._proc is not None and self._proc.poll() is None:
return
if not LOCAL_MODEL_PATH.exists():
raise RuntimeError(f"LLM model not found at {LOCAL_MODEL_PATH}")
bin_path = shutil.which(SERVER_BIN) or SERVER_BIN
cmd = [
bin_path,
"-m", str(LOCAL_MODEL_PATH),
"--host", HOST,
"--port", str(PORT),
"--n-gpu-layers", str(N_GPU_LAYERS),
"--ctx-size", str(CTX_SIZE),
"--threads", str(THREADS),
"--log-disable",
]
log.info("launching llama-server: %s", " ".join(cmd))
self._proc = subprocess.Popen(
cmd,
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE,
text=True,
)
self._wait_llama_cpp_ready()
log.info("llama-server ready (pid=%d)", self._proc.pid)
def _wait_llama_cpp_ready(self) -> None:
import urllib.request
deadline = time.time() + STARTUP_TIMEOUT
url = f"http://{HOST}:{PORT}/health"
while time.time() < deadline:
if self._proc and self._proc.poll() is not None:
stderr = self._proc.stderr.read() if self._proc.stderr else ""
raise RuntimeError(
f"llama-server exited early (code={self._proc.returncode}): {stderr[:500]}"
)
try:
with urllib.request.urlopen(url, timeout=1) as r:
if r.status == 200:
return
except Exception:
time.sleep(0.3)
raise RuntimeError(f"llama-server did not come up within {STARTUP_TIMEOUT}s")
async def _stream_llama_cpp(self, user_text: str, system_prompt: str,
cancel: asyncio.Event) -> AsyncIterator[str]:
import aiohttp
prompt = self._format_chatml_prompt(user_text, system_prompt)
payload = {
"prompt": prompt,
"stream": True,
"n_predict": MAX_TOKENS,
"temperature": TEMPERATURE,
"top_p": TOP_P,
"stop": STOP_SEQS,
"cache_prompt": True,
}
url = f"http://{HOST}:{PORT}/completion"
buf = ""
async with aiohttp.ClientSession() as sess:
try:
async with sess.post(
url, json=payload,
timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)) as resp:
async for raw in resp.content:
if cancel.is_set():
log.info("LLM stream cancelled (barge-in)")
return
line = raw.decode("utf-8", errors="ignore").strip()
if not line.startswith("data:"):
continue
line = line[len("data:"):].strip()
if not line or line == "[DONE]":
continue
try:
obj = json.loads(line)
except json.JSONDecodeError:
continue
token = obj.get("content", "")
if not token:
if obj.get("stop"):
break
continue
buf += token
if len(buf) >= CHUNK_MIN_CHARS and buf[-1] in CHUNK_DELIMS:
yield buf.strip()
buf = ""
except asyncio.CancelledError:
return
except Exception as exc:
log.warning("llama-server stream error: %s", exc)
return
if buf.strip():
yield buf.strip()
@staticmethod
def _format_chatml_prompt(user_text: str, system_prompt: str) -> str:
return (
f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
f"<|im_start|>user\n{user_text}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
# ─── public streaming entry point ─────────────────────
async def stream(self, user_text: str, system_prompt: str,
cancel: asyncio.Event) -> AsyncIterator[str]:
"""Yield sentence-sized text chunks as the LLM generates.
Chunk boundaries: any char in `CHUNK_DELIMS` AND buffer length
`CHUNK_MIN_CHARS`. The final buffer is flushed on completion
even without a delimiter. If `cancel` is set, the request is
aborted and the generator returns.
"""
if BACKEND == "ollama":
async for chunk in self._stream_ollama(user_text, system_prompt, cancel):
yield chunk
elif BACKEND == "llama_cpp":
async for chunk in self._stream_llama_cpp(user_text, system_prompt, cancel):
yield chunk
else:
raise RuntimeError(f"unknown llm.backend: {BACKEND!r}")
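
Both `_stream_*` methods inline the same boundary rule; factored out as a pure function it is easier to see. A hypothetical helper, not part of this commit:

# Sketch: the shared chunking rule, flush once the buffer ends on a
# delimiter AND has at least CHUNK_MIN_CHARS characters.
from typing import Optional, Tuple

def feed(buf: str, token: str, delims: str = ".,?!؟،",
         min_chars: int = 8) -> Tuple[str, Optional[str]]:
    """Append one token; return (new_buffer, completed_chunk_or_None)."""
    buf += token
    if len(buf) >= min_chars and buf[-1] in delims:
        return "", buf.strip()
    return buf, None

# feed("", "Hello there.") -> ("", "Hello there.")  # 12 chars, ends on '.'
# feed("", "Hi.")          -> ("Hi.", None)         # under min_chars, held back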

local/script.py  (new file, +259 lines)

@@ -0,0 +1,259 @@
"""LocalBrain — fully on-device voice pipeline.
Implements the same contract as `gemini/script.py:GeminiBrain` so
`voice/sanad_voice.py` can swap it in via `SANAD_VOICE_BRAIN=local`.
Wires together four subsystems:
Phase 1 Silero VAD (mic speech boundaries)
Phase 2 faster-whisper (speech text)
Phase 3 llama.cpp + Qwen (text streaming text chunks)
Phase 4 CosyVoice2 streaming (text chunk cloned-voice audio)
Phase 5 barge-in (user speaks cancel LLM + stop speaker)
Phase 6 stability model load fails cleanly, crashes are logged.
Async structure:
run() is the main coroutine. It spawns three tasks:
_mic_task reads mic, VAD, Whisper, pushes user text to _llm_queue
_dialogue_task pops user text, streams LLM tokens into _tts_queue
_tts_task pops text chunks, synthesises, feeds the speaker
Logging contract (matched by local/subprocess.py._track_line):
"connecting to local pipeline"
"listening"
"USER: <text>"
"BOT: <text>"
"BARGE-IN (local)"
"session error: <msg>"
"""
from __future__ import annotations
import asyncio
import time
from typing import Optional
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
from Project.Sanad.local.llm import LlamaServer
from Project.Sanad.local.stt import WhisperSTT
from Project.Sanad.local.tts import CosyVoiceTTS
from Project.Sanad.local.vad import SileroVAD, FRAME_SAMPLES
log = get_logger("local_brain")
_CFG_SV = _cfg_section("voice", "sanad_voice")
_CHUNK_BYTES = FRAME_SAMPLES * 2 # int16 mono
class LocalBrain:
"""Fully on-device Gemini replacement."""
def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
system_prompt: str = ""):
self._audio = audio_io
self._mic = audio_io.mic
self._speaker = audio_io.speaker
self._recorder = recorder
self._voice = voice_name
self._system_prompt = system_prompt
# subsystems — instantiated here, loaded in run()
self._vad = SileroVAD()
self._stt = WhisperSTT()
self._llm = LlamaServer()
self._tts = CosyVoiceTTS()
# pipeline queues
self._llm_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=4)
self._tts_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=4)
# control flags
self._stop_flag = asyncio.Event() # full shutdown
self._interrupt = asyncio.Event() # per-turn barge-in
self._speaking = False
self._speak_start_time = 0.0
# ─── lifecycle ────────────────────────────────────────
def stop(self) -> None:
self._stop_flag.set()
self._interrupt.set()
async def run(self) -> None:
"""Main entry. Loads models, runs pipeline, handles shutdown."""
log.info("connecting to local pipeline")
try:
await asyncio.to_thread(self._vad.start)
await asyncio.to_thread(self._stt.start)
await asyncio.to_thread(self._llm.start)
await asyncio.to_thread(self._tts.start)
except Exception as exc:
log.error("session error: local pipeline startup failed — %s", exc)
return
log.info("listening")
try:
await asyncio.gather(
self._mic_task(),
self._dialogue_task(),
self._tts_task(),
)
except asyncio.CancelledError:
log.info("cancelled — stopping")
except Exception as exc:
log.error("session error: %s", exc)
finally:
try:
self._llm.stop()
except Exception:
log.warning("LlamaServer.stop failed", exc_info=True)
self._tts.stop()
self._stt.stop()
self._vad.stop()
log.info("local pipeline stopped")
# ─── barge-in ─────────────────────────────────────────
def _begin_barge_in(self) -> None:
"""Called from mic task when user starts speaking while bot is."""
if not self._speaking:
return
log.info("BARGE-IN (local)")
self._interrupt.set()
try:
self._speaker.stop()
except Exception:
log.warning("speaker.stop during barge-in failed", exc_info=True)
# drain pipelines — discard any pending LLM/TTS chunks for this turn
self._drain_queue(self._llm_queue)
self._drain_queue(self._tts_queue)
self._speaking = False
try:
self._recorder.finish_turn()
except Exception:
pass
@staticmethod
def _drain_queue(q: asyncio.Queue) -> None:
try:
while True:
q.get_nowait()
q.task_done()
except asyncio.QueueEmpty:
pass
# ─── Task 1: mic → VAD → Whisper → LLM queue ──────────
async def _mic_task(self) -> None:
loop = asyncio.get_event_loop()
while not self._stop_flag.is_set():
try:
pcm = await loop.run_in_executor(
None, self._mic.read_chunk, _CHUNK_BYTES,
)
except Exception:
await asyncio.sleep(0.01)
continue
event = self._vad.process(pcm)
if event == "speech_start":
# user started talking — if bot is speaking, it's a barge-in
if self._speaking:
self._begin_barge_in()
elif event == "speech_end":
utt = self._vad.collected_audio()
if not utt:
continue
try:
self._recorder.capture_user(utt)
except Exception:
pass
text = await loop.run_in_executor(None, self._stt.transcribe, utt)
if not text:
continue
log.info("USER: %s", text)
try:
self._recorder.add_user_text(text)
except Exception:
pass
# wake the LLM side — drop older pending item if full (latency > throughput)
if self._llm_queue.full():
try:
self._llm_queue.get_nowait()
except asyncio.QueueEmpty:
pass
await self._llm_queue.put(text)
# ─── Task 2: LLM streaming → TTS queue ────────────────
async def _dialogue_task(self) -> None:
while not self._stop_flag.is_set():
try:
user_text = await asyncio.wait_for(
self._llm_queue.get(), timeout=0.2)
except asyncio.TimeoutError:
continue
self._interrupt.clear()
full_response = []
async for chunk in self._llm.stream(
user_text, self._system_prompt, self._interrupt):
if self._interrupt.is_set():
break
full_response.append(chunk)
await self._tts_queue.put(chunk)
self._llm_queue.task_done()
if full_response and not self._interrupt.is_set():
bot_text = " ".join(full_response).strip()
if bot_text:
log.info("BOT: %s", bot_text)
try:
self._recorder.add_robot_text(bot_text)
except Exception:
pass
# ─── Task 3: TTS → speaker ────────────────────────────
async def _tts_task(self) -> None:
loop = asyncio.get_event_loop()
while not self._stop_flag.is_set():
try:
chunk_text = await asyncio.wait_for(
self._tts_queue.get(), timeout=0.2)
except asyncio.TimeoutError:
# idle — if we've been speaking and queue drained, close stream
if self._speaking and self._llm_queue.empty() and self._tts_queue.empty():
await loop.run_in_executor(None, self._speaker.wait_finish)
self._speaking = False
log.info("listening")
try:
self._recorder.finish_turn()
except Exception:
pass
continue
if self._interrupt.is_set():
self._tts_queue.task_done()
continue
# synthesise this text chunk → stream to speaker
if not self._speaking:
await loop.run_in_executor(None, self._speaker.begin_stream)
self._speaking = True
self._speak_start_time = time.time()
try:
for pcm in self._tts.synthesize_stream(chunk_text):
if self._interrupt.is_set():
break
try:
self._recorder.capture_robot(pcm)
except Exception:
pass
await loop.run_in_executor(
None, self._speaker.send_chunk,
pcm, self._tts.output_rate,
)
except Exception as exc:
log.warning("TTS chunk failed: %s", exc)
finally:
self._tts_queue.task_done()

local/stt.py  (new file, +96 lines)

@@ -0,0 +1,96 @@
"""faster-whisper Large V3 Turbo — GPU INT8 transcription.
Phase 2 of the local pipeline. Given an utterance (int16 PCM bytes at
16 kHz), returns transcribed text. Short / empty / no-speech results are
filtered out per config thresholds to avoid firing phantom triggers.
Install (on the robot, in the `local` env):
pip install faster-whisper==1.0.*
# model auto-downloads from HuggingFace on first `WhisperModel(...)` call,
# OR pre-download to model/local/faster-whisper-large-v3-turbo/ and point
# `local.stt.model_subdir` at it.
"""
from __future__ import annotations
from typing import Optional
import numpy as np
from Project.Sanad.config import MODEL_DIR
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
log = get_logger("local_stt")
_CFG = _cfg_section("local", "stt")
MODEL_NAME = _CFG.get("model_name", "large-v3-turbo")
MODEL_SUBDIR = _CFG.get("model_subdir", "faster-whisper-large-v3-turbo")
DEVICE = _CFG.get("device", "cuda")
COMPUTE_TYPE = _CFG.get("compute_type", "int8_float16")
BEAM_SIZE = _CFG.get("beam_size", 1)
LANGUAGE = _CFG.get("language") # None = auto-detect
VAD_FILTER = _CFG.get("vad_filter", False)
NO_SPEECH_THRESHOLD = _CFG.get("no_speech_threshold", 0.6)
MIN_CHARS = _CFG.get("min_utterance_chars", 2)
TEMPERATURE = _CFG.get("temperature", 0.0)
LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR
class WhisperSTT:
"""Thin wrapper around faster_whisper.WhisperModel."""
def __init__(self) -> None:
self._model = None
def start(self) -> None:
"""Load the model into VRAM. ~4 s on first call, 100 ms after."""
try:
from faster_whisper import WhisperModel
except ImportError as exc:
raise RuntimeError(
f"WhisperSTT requires 'faster-whisper': {exc}"
)
model_src = str(LOCAL_MODEL_DIR) if LOCAL_MODEL_DIR.exists() else MODEL_NAME
log.info("loading Whisper: src=%s device=%s compute=%s",
model_src, DEVICE, COMPUTE_TYPE)
self._model = WhisperModel(
model_src,
device=DEVICE,
compute_type=COMPUTE_TYPE,
)
log.info("WhisperSTT ready")
def transcribe(self, pcm: bytes) -> str:
"""Blocking transcription. Returns the full text or ''."""
if self._model is None:
log.warning("WhisperSTT.transcribe called before start()")
return ""
if not pcm:
return ""
audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
if audio.size == 0:
return ""
try:
segments, info = self._model.transcribe(
audio,
beam_size=BEAM_SIZE,
language=LANGUAGE,
vad_filter=VAD_FILTER,
no_speech_threshold=NO_SPEECH_THRESHOLD,
temperature=TEMPERATURE,
)
text = " ".join(seg.text.strip() for seg in segments).strip()
except Exception as exc:
log.warning("Whisper transcribe failed: %s", exc)
return ""
if len(text) < MIN_CHARS:
log.debug("drop short transcript: %r", text)
return ""
return text
def stop(self) -> None:
self._model = None
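
Standalone usage is two calls. A sketch transcribing a WAV from disk (the path is illustrative; run inside the `local` conda env):

# Sketch: offline smoke test for WhisperSTT.
import wave
from Project.Sanad.local.stt import WhisperSTT

stt = WhisperSTT()
stt.start()                                         # loads the model into VRAM once
with wave.open("sample_16k_mono.wav", "rb") as wf:  # illustrative path; 16 kHz mono int16
    pcm = wf.readframes(wf.getnframes())
print(stt.transcribe(pcm) or "<no speech / too short>")
stt.stop()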

local/subprocess.py  (new file, +261 lines)

@@ -0,0 +1,261 @@
"""Local live subprocess supervisor.
Spawns `voice/sanad_voice.py` as a managed child with
`SANAD_VOICE_BRAIN=local`, tails the child's stdout, and extracts state
transitions + user transcripts from the log markers emitted by
`local/script.py:LocalBrain`.
Mirror of `gemini/subprocess.py`. Lives separately so the two supervisors
stay decoupled; adding a new model does not touch this file.
"""
from __future__ import annotations
import os
import signal
import subprocess
import sys
import threading
from collections import deque
from datetime import datetime
from pathlib import Path
from typing import Any
from Project.Sanad.config import BASE_DIR, LOGS_DIR, SCRIPTS_DIR, LIVE_TUNE
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
log = get_logger("local_subprocess")
_LS_CFG = _cfg_section("local", "subprocess")
def _resolve_live_script() -> Path:
"""Locate the voice script to run as subprocess (same as Gemini's)."""
override = os.environ.get("SANAD_LIVE_SCRIPT", "").strip()
if override:
p = Path(override).expanduser()
if p.exists():
return p
for c in (BASE_DIR / "voice" / "sanad_voice.py",
SCRIPTS_DIR / "sanad_voice.py"):
if c.exists():
return c
return SCRIPTS_DIR / "sanad_voice.py"
LIVE_SCRIPT = _resolve_live_script()
LOG_TAIL_SIZE = _LS_CFG.get("log_tail_size", 2000)
TRANSCRIPT_TAIL_SIZE = _LS_CFG.get("transcript_tail_size", 30)
LIVE_LOG_DIR = LOGS_DIR
LIVE_LOG_NAME = _LS_CFG.get("log_name", "local_subprocess")
# Python binary for the child process. The local pipeline runs in a
# separate conda env (Python 3.8 + Jetson CUDA torch + CosyVoice/Whisper);
# the dashboard stays in gemini_sdk (Python 3.10). Override with
# SANAD_LOCAL_PYTHON env var at runtime.
LOCAL_PYTHON_BIN = os.environ.get(
"SANAD_LOCAL_PYTHON",
_LS_CFG.get("python_bin", sys.executable),
)
_STOP_TIMEOUT_SEC = _LS_CFG.get("stop_timeout_sec", 5.0)
_TERMINATE_TIMEOUT_SEC = _LS_CFG.get("terminate_timeout_sec", 3.0)
_NOISY_PREFIXES = tuple(_LS_CFG.get("noisy_prefixes", [
"ALSA lib ", "Expression 'alsa_", "Cannot connect to server socket",
"jack server is not running",
]))
_NOISY_FRAGMENTS = tuple(_LS_CFG.get("noisy_fragments", [
"Unknown PCM", "Evaluate error", "snd_pcm_open_noupdate", "PaAlsaStream",
]))
class LocalSubprocess:
def __init__(self):
self._lock = threading.Lock()
self.process: subprocess.Popen | None = None
self.log_tail: deque[str] = deque(maxlen=LOG_TAIL_SIZE)
self.user_transcript: deque[str] = deque(maxlen=TRANSCRIPT_TAIL_SIZE)
self._reader_thread: threading.Thread | None = None
self._log_file = None
self.state = "stopped"
self.state_message = "Idle."
self.last_user_text = ""
self.suppressed_noise = 0
# ─── log I/O ──────────────────────────────────────────
def _open_session_log(self, pid: int):
try:
LIVE_LOG_DIR.mkdir(parents=True, exist_ok=True)
fname = f"{LIVE_LOG_NAME}_{datetime.now().strftime('%Y%m%d')}.log"
fh = open(LIVE_LOG_DIR / fname, "a", encoding="utf-8", buffering=1)
fh.write(
f"\n===== local subprocess start "
f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} pid={pid} =====\n"
)
return fh
except Exception as exc:
log.warning("Could not open local subprocess log file: %s", exc)
return None
def _is_noisy(self, line: str) -> bool:
return line.startswith(_NOISY_PREFIXES) or any(f in line for f in _NOISY_FRAGMENTS)
def _set_state(self, state: str, msg: str):
self.state = state
self.state_message = msg
def _track_line(self, line: str):
"""Parse log markers emitted by `local/script.py:LocalBrain`.
Must stay in lock-step with the `log.info(...)` strings there.
"""
if "connecting to local pipeline" in line:
self._set_state("connecting", line)
elif " USER: " in line or line.strip().startswith("USER:"):
text = line.split("USER:", 1)[1].strip()
if text:
self.last_user_text = text
self.user_transcript.append(text)
self._set_state("hearing", f"User: {text}")
elif " BOT: " in line or line.strip().startswith("BOT:"):
self._set_state("speaking", line.split("BOT:", 1)[1].strip()[:80])
elif "BARGE-IN (local)" in line:
self._set_state("interrupting", line)
elif "session error" in line:
self._set_state("error", line)
elif "local pipeline stopped" in line or "cancelled — stopping" in line:
self._set_state("stopped", line)
elif "listening" in line.lower() and "no speech" not in line:
self._set_state("listening", "Listening for speech.")
def _reader_loop(self):
proc = self.process
if proc is None or proc.stdout is None:
return
fh = self._open_session_log(proc.pid)
self._log_file = fh
for line in proc.stdout:
clean = line.rstrip()
if not clean:
continue
if fh is not None:
try:
fh.write(clean + "\n")
except Exception:
pass
with self._lock:
if self._is_noisy(clean):
self.suppressed_noise += 1
continue
self.log_tail.append(clean)
self._track_line(clean)
with self._lock:
self.log_tail.append("Local pipeline process exited.")
self._set_state("stopped", "Process exited.")
if fh is not None:
try:
fh.write(
f"===== local subprocess exit "
f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====\n"
)
fh.close()
except Exception:
pass
self._log_file = None
# ─── lifecycle ────────────────────────────────────────
def is_running(self) -> bool:
with self._lock:
return self.process is not None and self.process.poll() is None
def start(self) -> dict[str, Any]:
with self._lock:
if self.process is not None and self.process.poll() is None:
return {"started": False, "message": "Already running.", "pid": self.process.pid}
self._set_state("starting", "Starting local pipeline (loading models)...")
script = LIVE_SCRIPT
if not script.exists():
raise RuntimeError(f"Script not found: {script}")
env = os.environ.copy()
env.update({
"PYTHONUNBUFFERED": "1",
**LIVE_TUNE,
"SANAD_VOICE_BRAIN": "local",
})
dds_iface = env.get("SANAD_DDS_INTERFACE", "eth0")
# Use the `local` env's Python so CUDA torch + CosyVoice are available.
# Fall back to sys.executable only if the configured bin doesn't exist.
py_bin = LOCAL_PYTHON_BIN
if not Path(py_bin).exists():
log.warning("LOCAL_PYTHON_BIN=%s not found, falling back to %s",
py_bin, sys.executable)
py_bin = sys.executable
cmd = [py_bin, str(script), dds_iface]
proc = subprocess.Popen(
cmd,
cwd=str(script.parent),
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
bufsize=1,
env=env,
)
with self._lock:
self.process = proc
self.log_tail.append(f"Started: pid={proc.pid}")
self._set_state("starting", f"pid={proc.pid}")
self._reader_thread = threading.Thread(target=self._reader_loop, daemon=True)
self._reader_thread.start()
log.info("Local subprocess started: pid=%d", proc.pid)
return {"started": True, "pid": proc.pid}
def stop(self) -> dict[str, Any]:
with self._lock:
proc = self.process
if proc is None or proc.poll() is not None:
return {"stopped": False, "message": "Not running."}
self._set_state("stopping", "Stopping...")
try:
proc.send_signal(signal.SIGINT)
proc.wait(timeout=_STOP_TIMEOUT_SEC)
except subprocess.TimeoutExpired:
proc.terminate()
try:
proc.wait(timeout=_TERMINATE_TIMEOUT_SEC)
except subprocess.TimeoutExpired:
proc.kill()
proc.wait(timeout=_TERMINATE_TIMEOUT_SEC)
rc = proc.returncode
with self._lock:
self.process = None
self.log_tail.append("Stopped.")
self._set_state("stopped", "Stopped.")
log.info("Local subprocess stopped (rc=%s)", rc)
return {"stopped": True, "returncode": rc}
def status(self) -> dict[str, Any]:
with self._lock:
running = self.process is not None and self.process.poll() is None
return {
"running": running,
"pid": self.process.pid if running and self.process else None,
"state": self.state,
"state_message": self.state_message,
"last_user_text": self.last_user_text,
"user_transcript": list(self.user_transcript),
"log_tail": list(self.log_tail),
"suppressed_noise": self.suppressed_noise,
}
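
Dashboard wiring is not part of this file. A sketch of how the supervisor would plausibly be exposed, mirroring the existing /api/live-subprocess endpoints; the route paths and app plumbing are assumptions:

# Sketch: assumed FastAPI wiring, modeled on the Gemini endpoints.
from fastapi import APIRouter
from Project.Sanad.local.subprocess import LocalSubprocess

router = APIRouter()
_local = LocalSubprocess()

@router.post("/api/local-subprocess/start")
def start_local():
    return _local.start()    # {"started": ..., "pid": ...}

@router.post("/api/local-subprocess/stop")
def stop_local():
    return _local.stop()

@router.get("/api/local-subprocess/status")
def local_status():
    return _local.status()   # state, transcript tail, log tail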

local/tts.py  (new file, +126 lines)

@@ -0,0 +1,126 @@
"""CosyVoice2 0.5B streaming TTS — GPU.
Phase 4 of the local pipeline. Holds a 3-second reference WAV in VRAM
and synthesises streaming Arabic/English audio for every text chunk
arriving from the LLM. Emits int16 PCM at the model's native rate
(CosyVoice2 outputs 22 050 Hz; we resample to `sample_rate` from
config so the downstream `audio_io.speaker` gets a consistent rate).
Install (on the robot):
cd ~/src
git clone --recursive https://github.com/FunAudioLLM/CosyVoice
cd CosyVoice
pip install -r requirements.txt
pip install -e .
# model + reference voice
huggingface-cli download FunAudioLLM/CosyVoice2-0.5B \\
--local-dir ~/sanad/model/local/CosyVoice2-0.5B
# place a 3-s Khaleeji clip at model/local/khaleeji_reference_3s.wav
# (16 kHz mono int16 WAV)
API note:
CosyVoice2 is evolving. We use the published `inference_zero_shot`
with `stream=True` which yields `{"tts_speech": tensor}` chunks.
If the upstream API renames, adapt in one place: `CosyVoiceTTS.synthesize_stream`.
"""
from __future__ import annotations
from pathlib import Path
from typing import AsyncIterator, Iterator, Optional
import numpy as np
from Project.Sanad.config import MODEL_DIR
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
log = get_logger("local_tts")
_CFG = _cfg_section("local", "tts")
MODEL_SUBDIR = _CFG.get("model_subdir", "CosyVoice2-0.5B")
REFERENCE_WAV_SUBDIR = _CFG.get("reference_wav_subdir", "khaleeji_reference_3s.wav")
REFERENCE_PROMPT = _CFG.get("reference_prompt", "")
OUT_RATE = int(_CFG.get("sample_rate", 16000))
QUEUE_MAX = int(_CFG.get("queue_max", 3))
DEVICE = _CFG.get("device", "cuda")
LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR
REFERENCE_WAV_PATH = MODEL_DIR / "local" / REFERENCE_WAV_SUBDIR
def _resample_int16(pcm: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
if src_rate == dst_rate or pcm.size == 0:
return pcm.astype(np.int16, copy=False)
target_len = max(1, int(len(pcm) * dst_rate / src_rate))
return np.interp(
np.linspace(0, len(pcm), target_len, endpoint=False),
np.arange(len(pcm)),
pcm.astype(np.float64),
).astype(np.int16)
class CosyVoiceTTS:
"""Thin async wrapper around CosyVoice2 streaming inference."""
def __init__(self) -> None:
self._model = None
self._ref_speech = None # preloaded reference tensor
self._ref_prompt = REFERENCE_PROMPT
self._model_rate: int = 22050
def start(self) -> None:
try:
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav
except ImportError as exc:
raise RuntimeError(
f"CosyVoiceTTS requires the CosyVoice package from source: {exc}"
)
if not LOCAL_MODEL_DIR.exists():
raise RuntimeError(f"CosyVoice2 model not found at {LOCAL_MODEL_DIR}")
if not REFERENCE_WAV_PATH.exists():
raise RuntimeError(
f"Reference voice WAV not found at {REFERENCE_WAV_PATH}"
)
log.info("loading CosyVoice2: %s", LOCAL_MODEL_DIR)
self._model = CosyVoice2(str(LOCAL_MODEL_DIR), load_jit=True, fp16=True)
# model.sample_rate is an instance attr on CosyVoice2
self._model_rate = getattr(self._model, "sample_rate", 22050)
self._ref_speech = load_wav(str(REFERENCE_WAV_PATH), 16000)
log.info("CosyVoiceTTS ready (model_rate=%d)", self._model_rate)
def synthesize_stream(self, text: str) -> Iterator[bytes]:
"""Yield int16 PCM bytes at OUT_RATE, one streaming chunk at a time."""
if self._model is None or self._ref_speech is None:
return
try:
# CosyVoice2 streaming generator. Each step yields a tensor
# of float32 waveform samples at the model's native rate.
for step in self._model.inference_zero_shot(
text,
self._ref_prompt,
self._ref_speech,
stream=True):
wave = step.get("tts_speech")
if wave is None:
continue
# tensor → float32 numpy → int16 at OUT_RATE
arr = wave.cpu().numpy().squeeze()
if arr.size == 0:
continue
pcm_i16 = np.clip(arr * 32767.0, -32768, 32767).astype(np.int16)
if self._model_rate != OUT_RATE:
pcm_i16 = _resample_int16(pcm_i16, self._model_rate, OUT_RATE)
yield pcm_i16.tobytes()
except Exception as exc:
log.warning("TTS synth failed for chunk %r: %s", text[:40], exc)
def stop(self) -> None:
self._model = None
self._ref_speech = None
@property
def output_rate(self) -> int:
return OUT_RATE
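# Smoke-test sketch (an assumption, not part of the module contract):
# exercises the streaming path end-to-end once the model and reference
# WAV are installed; prints chunk sizes instead of playing audio.
if __name__ == "__main__":
    tts = CosyVoiceTTS()
    tts.start()
    total_samples = 0
    for chunk in tts.synthesize_stream("Hello from Sanad!"):
        total_samples += len(chunk) // 2  # int16 = 2 bytes per sample
        print(f"chunk {len(chunk)} bytes (total {total_samples / OUT_RATE:.2f}s)")
    tts.stop()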

150
local/vad.py Normal file
View File

@ -0,0 +1,150 @@
"""Silero VAD wrapper — CPU-only speech boundary detection.
Phase 1 of the local pipeline. Consumes 16 kHz mono int16 PCM in short
frames, emits speech_start / speech_end events. All thresholds + frame
sizes come from config/local_config.json > vad.
Install (on the robot):
pip install silero-vad torch==2.2.* torchaudio==2.2.*
Usage:
vad = SileroVAD()
vad.start()
evt = vad.process(pcm_bytes)
if evt == 'speech_start': ...
elif evt == 'speech_end': buf = vad.collected_audio()
"""
from __future__ import annotations
import time
from typing import Optional
import numpy as np
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
log = get_logger("local_vad")
_CFG = _cfg_section("local", "vad")
SAMPLE_RATE = _CFG.get("sample_rate", 16000)
FRAME_MS = _CFG.get("frame_ms", 32)
THRESHOLD = _CFG.get("threshold", 0.55)
MIN_SILENCE_MS = _CFG.get("min_silence_ms", 400)
MIN_SPEECH_MS = _CFG.get("min_speech_ms", 250)
PAD_START_MS = _CFG.get("pad_start_ms", 200)
PAD_END_MS = _CFG.get("pad_end_ms", 200)
FRAME_SAMPLES = SAMPLE_RATE * FRAME_MS // 1000 # 512 @ 16k/32ms
class SileroVAD:
"""Streaming VAD with buffered utterance capture.
Fed one mic frame at a time via `process()`. Internal state tracks
whether we're inside an utterance; on speech_end, `collected_audio()`
returns the full utterance (with configured padding).
"""
def __init__(self) -> None:
self._model = None
self._audio_buf: list[bytes] = [] # utterance being collected
self._pre_buf: list[bytes] = [] # rolling "pre-speech" ring
self._pre_frames = max(1, PAD_START_MS // FRAME_MS)
self._pad_end_frames = max(1, PAD_END_MS // FRAME_MS)
self._in_speech = False
self._last_speech_time = 0.0
self._speech_start_time = 0.0
self._trailing_silence_frames = 0
self._last_utterance: Optional[bytes] = None
def start(self) -> None:
"""Load the Silero model once. Call before `process()`."""
try:
import torch
from silero_vad import load_silero_vad
except ImportError as exc:
raise RuntimeError(
f"SileroVAD requires 'silero-vad' + torch: {exc}"
)
self._model = load_silero_vad()
log.info("SileroVAD ready (threshold=%.2f, frame=%dms)",
THRESHOLD, FRAME_MS)
def process(self, pcm: bytes) -> Optional[str]:
"""Feed one frame (≈ FRAME_MS of audio). Returns an event or None.
Events: 'speech_start' | 'speech_end' | None
"""
if self._model is None:
return None
# keep a rolling pre-buffer so captured utterances include lead-in
self._pre_buf.append(pcm)
if len(self._pre_buf) > self._pre_frames:
self._pre_buf.pop(0)
# VAD expects float32 in [-1, 1]
arr = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
if arr.size < FRAME_SAMPLES:
# pad if short tail chunk arrived
arr = np.concatenate([arr, np.zeros(FRAME_SAMPLES - arr.size, dtype=np.float32)])
elif arr.size > FRAME_SAMPLES:
arr = arr[:FRAME_SAMPLES]
try:
import torch
with torch.no_grad():
prob = float(self._model(torch.from_numpy(arr), SAMPLE_RATE).item())
except Exception as exc:
log.warning("VAD inference failed: %s", exc)
return None
now = time.time()
is_speech = prob >= THRESHOLD
if is_speech:
self._trailing_silence_frames = 0
self._last_speech_time = now
if not self._in_speech:
# transition → speech
self._in_speech = True
self._speech_start_time = now
self._audio_buf = list(self._pre_buf) # seed with pad
self._audio_buf.append(pcm)
return "speech_start"
self._audio_buf.append(pcm)
return None
# silent frame
if self._in_speech:
self._audio_buf.append(pcm) # collect trailing pad
self._trailing_silence_frames += 1
silence_ms = self._trailing_silence_frames * FRAME_MS
if silence_ms >= MIN_SILENCE_MS:
# speech ended — validate min_speech
speech_dur_ms = (now - self._speech_start_time) * 1000
self._in_speech = False
if speech_dur_ms < MIN_SPEECH_MS:
log.debug("drop short utterance (%.0fms)", speech_dur_ms)
self._audio_buf.clear()
self._last_utterance = None
return None
self._last_utterance = b"".join(self._audio_buf)
self._audio_buf.clear()
return "speech_end"
return None
def collected_audio(self) -> Optional[bytes]:
"""After a speech_end event, return the full utterance bytes."""
return self._last_utterance
def reset(self) -> None:
"""Drop any in-flight utterance (used on barge-in)."""
self._in_speech = False
self._audio_buf.clear()
self._trailing_silence_frames = 0
self._last_utterance = None
def stop(self) -> None:
self._model = None

16
main.py
View File

@ -11,6 +11,7 @@ from __future__ import annotations
import argparse
import importlib
import os
import sys
import types
from pathlib import Path
@ -92,8 +93,9 @@ LocalTTSEngine = _safe_import("LocalTTSEngine", lambda: __import__("
WakePhraseManager = _safe_import("WakePhraseManager", lambda: __import__("Project.Sanad.voice.wake_phrase_manager", fromlist=["WakePhraseManager"]).WakePhraseManager)
LiveVoiceLoop = _safe_import("LiveVoiceLoop", lambda: __import__("Project.Sanad.voice.live_voice_loop", fromlist=["LiveVoiceLoop"]).LiveVoiceLoop)
TypedReplayEngine = _safe_import("TypedReplayEngine", lambda: __import__("Project.Sanad.voice.typed_replay", fromlist=["TypedReplayEngine"]).TypedReplayEngine)
GeminiVoiceClient = _safe_import("GeminiVoiceClient", lambda: __import__("Project.Sanad.gemini.client", fromlist=["GeminiVoiceClient"]).GeminiVoiceClient)
GeminiSubprocess = _safe_import("GeminiSubprocess", lambda: __import__("Project.Sanad.gemini.subprocess", fromlist=["GeminiSubprocess"]).GeminiSubprocess)
LocalSubprocess = _safe_import("LocalSubprocess", lambda: __import__("Project.Sanad.local.subprocess", fromlist=["LocalSubprocess"]).LocalSubprocess)
# ── global instances (imported by route modules) ──
@ -108,7 +110,15 @@ macro_rec = _safe_construct("macro_rec", (lambda: MacroRecorder(arm)) if
macro_play = _safe_construct("macro_play", (lambda: MacroPlayer(audio_mgr, arm)) if (MacroPlayer and arm) else None)
teacher = _safe_construct("teacher", (lambda: TeachingSession(arm)) if (TeachingSession and arm) else None)
live_voice = _safe_construct("live_voice", (lambda: LiveVoiceLoop(voice_client, arm, wake_mgr, audio_mgr)) if (LiveVoiceLoop and voice_client and arm and wake_mgr and audio_mgr) else None)
# Which voice supervisor to mount. SANAD_VOICE_BRAIN chooses the brain
# that runs INSIDE the subprocess (see voice/sanad_voice.py); the same
# env var picks WHICH supervisor here manages that subprocess so its
# log-line parser matches the brain's emit format.
_brain_choice = os.environ.get("SANAD_VOICE_BRAIN", "gemini").strip().lower()
if _brain_choice == "local" and LocalSubprocess is not None:
live_sub = _safe_construct("live_sub", LocalSubprocess)
else:
live_sub = _safe_construct("live_sub", GeminiSubprocess)
typed_replay = _safe_construct("typed_replay", (lambda: TypedReplayEngine(voice_client, audio_mgr)) if (TypedReplayEngine and voice_client and audio_mgr) else None)
# Wire everything into the Brain (only what was constructed)

View File

@ -1,51 +0,0 @@
أنت "بوسنده" — مساعد صوتي إماراتي ذكي تابع لروبوت شركة لوتاه تيك Lootah Tech.
[أولاً: الروح الرمضانية والمعرفة]
أنت على علم تام بأننا في شهر رمضان المبارك لعام 2026.
لديك معرفة واسعة بالقرآن الكريم والأحاديث النبوية الشريفة للإجابة على أي سؤال ديني أو تقديم تذكيرات إيمانية بدقة مع ذكر المصدر.
إذا سألك المستخدم عن آية، حديث، أو حكم صيام، جاوب بوقار وتبسيط بلهجتك الإماراتية.
[ثانياً: الأسلوب واللغة (التبديل المرن)]
تكلم باللهجة الإماراتية بشكل طبيعي بدون مبالغة.
قاعدة التبديل الفوري: إذا استخدم المستخدم أي لغة ثانية في أي لحظة، غيّر فوراً ورد بنفس اللغة الجديدة في نفس الرد.
إذا رجع المستخدم للعربي: أرجع فوراً للعربي (لهجة إماراتية).
"آخر لغة كتب فيها المستخدم" هي اللغة اللي ترد فيها.
ممنوع تخلط لغتين في نفس الرد إلا لطلب ترجمة أو مقارنة صريحة.
[ثالثاً: التفاعل والبدايات]
خلك محترم، ودود، ومباشر، وركّز على الزبدة والحل العملي.
تنويع البداية: استخدم عبارات مثل (مبارك عليك الشهر، عساكم من عواده، تقبل الله طاعتكم، فالك طيب، أبشر بعزك، مرحبابك) ولا تكرر نفس العبارة مرتين متتاليتين.
إذا كان السؤال تقني سريع أو كود، ابدأ مباشرة بدون مقدمات.
[رابعاً: إنجاز المهام وقوة الذاكرة العمومية]
حفظ واسترجاع: اعتبر كل كلمة، اسم، مسار، أو تفصيل تقوله بمثابة "أمر حفظ" وأولوية قصوى داخل المحادثة.
الثوابت: تعامل مع معلوماتك وتفضيلاتك وتصحيحاتك كأنها ثوابت محفورة في الذاكرة.
عند التصحيح: إذا عدّلت لي معلومة، قل: "زين نبهتني يا الشيخ، انحفرت في الذاكرة".
[خامساً: الأمان والخصوصية]
إذا كتب المستخدم API key أو Password أو Token: نبهه فوراً يمسحه ويبدله.
لا تطلب بيانات حساسة إلا للضرورة وبطريقة محترمة.
ممنوع أي نكت أو محتوى حساس في الدين أو السياسة.
[سادساً: السرعة والتكرار]
جاوب بسرعة وباختصار (من 2 إلى 6 سطور غالباً).
إذا طلب المستخدم "كرر" أو "repeat": أعد نفس الكلام بنفس اللغة الحالية وحرفياً إذا طلب ذلك.

View File

@ -1,269 +0,0 @@
You are "Bousandah" (بوسنده) — a smart Emirati voice assistant and real-estate concierge. Your ONLY project knowledge is the “Azure by Lapis on Al Marjan Island, Ras Al Khaimah (RAK)” content provided below. You must interact with users using ONLY this knowledge base and the rules in this prompt.
=================================
1) STYLE & LANGUAGE (MANDATORY)
=================================
- If the user speaks Arabic → reply in a friendly Emirati dialect (light, natural).
- If the user speaks English → reply in clear, modern English (light, natural).
- If the user speaks any other language → reply in that language as best as possible.
- Rule: reply in the SAME language as the user's last message.
- Do NOT mix languages in the same reply unless the user asks for translation.
Tone:
- Friendly, confident, professional, not robotic.
- Short answers by default (2–6 lines).
- If user asks for details, give structured bullet points.
Behavior:
- If question is clear → answer directly.
- If one missing detail is needed to answer correctly → ask ONLY ONE question.
- Keep spoken replies under 15 seconds.
- Do not invent facts. If info is not in the Knowledge Base, say:
"I don't have that detail in the provided project info."
Then offer what you CAN do from the provided info.
Calls-to-action (use only when helpful, choose ONE):
- Ask budget range
- Ask unit preference (Studio / 1BR / 2BR / 3BR / Penthouse / Sky Villa)
- Ask purpose (End-user vs Investor)
=================================
2) ROLE (MANDATORY)
=================================
You are a project specialist for Azure by Lapis.
You can:
- Explain the project and developer
- Answer FAQs
- Summarize payment plan / handover timeline
- Compare unit types by size
- Provide short sales scripts, WhatsApp replies, call scripts, captions, ad copy
All outputs must be based ONLY on the Knowledge Base below.
=================================
3) KNOWLEDGE BASE (USE ONLY THIS)
=================================
-------------------------------------------------
A) Project Header
-------------------------------------------------
Azure by Lapis on Al Marjan Island, RAK
Resort-style living in Ras Al Khaimah's coastal landmark — where ocean views, architectural elegance, and investment value unite.
STARTING PRICE: AED 750K
PAYMENT PLAN: 75/25
HANDOVER: Q4 2028
ROI POTENTIAL: Up to 9%
-------------------------------------------------
B) About LAPIS Properties
-------------------------------------------------
LAPIS Properties is an established real estate developer with over 20 years of expertise in delivering innovative, sustainable, and community-driven developments across the Middle East and Turkey.
Famed for architectural precision and timeless aesthetics, LAPIS establishes spaces that blend functionality, design, and enduring value.
-------------------------------------------------
C) Overview of Azure by Lapis
-------------------------------------------------
Azure by Lapis is an architectural artwork redefining coastal living on Al Marjan Island, Ras Al Khaimah.
Developed by LAPIS Properties — a name with over 20 years of innovation and craftsmanship across the Middle East and Turkey — Azure rises 40 storeys tall, blending serenity with sophistication.
Designed to capture the horizon and enhance natural light, every home at Azure speaks of tranquility, privacy, and timeless design.
Nearby / Drive times:
- Al Hamra Mall — 1 min
- 5-star Hotels (Waldorf, Sofitel, Ritz-Carlton) — 4 min
- Al Hamra Golf Course — 5 min
- RAK Free Zone — 5 min
- RAK Beach — 5 min
- Marjan Island Boulevard — 7 min
- Jebel Jais — 45 min
- Dubai — 1 hr
-------------------------------------------------
D) Building and Operational Variables
-------------------------------------------------
The tower configuration is a critical technical component of the brief.
Azure is designed as a single, high-rise residential structure with the following specifications:
Component / Quantity / Details
- Residential Floors: 40 Floors — High-density luxury residential programming.
- Podium Levels: 3 Podiums — Primary structural base providing elevated views.
- Parking Ratio (1): 1 Space — Allocated to Studio, 1BR, and 2BR units.
- Parking Ratio (2): 2 Spaces — Allocated to 3BR units, Penthouses, and Sky Villas.
- Retail Component: Ground Floor — Designated for boutique retail and leisure outlets.
- Ownership Status: Freehold — Open to all nationalities.
- Completion Date: Q4 2028 — Targeted handover for investors.
-------------------------------------------------
E) Unit Typology and Space Allocation
-------------------------------------------------
Azure offers a diverse range of residences — from AED 750K (~407 sq.ft) studios to AED 9.5M (~5,100 sq.ft) expansive sky villas.
Unit Type / Suite Area (ft²) / Balcony Area (ft²) / Total Area (ft²)
- Studio (Normal): 333.47 / 239.82 / 573.29
- Studio (Premium): 460.05 / 241.33 / 701.38
- 1 Bedroom Apartment: 610.32 / 369.53 / 979.85
- 2 Bedroom Apartment: 867.79 / 539.06 / 1,406.85
- 3 Bedroom (Normal): 1,246.15 / 1,323.76 / 2,569.91
- 3 Bedroom Duplex (GF): 2,417.59 / 2,788.74 / 5,206.33
- 3 Bedroom Duplex (1stF): 2,417.59 / 2,788.74 / 5,206.33
-------------------------------------------------
F) FAQ about Azure by Lapis on Al Marjan Island
-------------------------------------------------
Q: Where is Azure by Lapis located?
A: Azure is located on Al Marjan Island in Ras Al Khaimah — a beachfront destination minutes from Al Hamra Mall, Golf Course, and major 5-star resorts.
Q: Who is the developer of Azure?
A: Azure is developed by LAPIS Real Estate FZ-LLC, a regional developer with two decades of excellence across the Middle East and Turkey.
Q: What types of properties are available at Azure?
A: The tower includes Studios, 1–3 Bedroom Apartments, Penthouses, and Sky Villas with sea views.
Q: What's the starting price at Azure?
A: Prices start from AED 750,000 for studios.
Q: What payment plan is available at Azure?
A: A 75/25 payment plan with 2 years post-handover.
Q: When is handover scheduled for Azure?
A: Handover is expected in Q4 2028.
Q: What ROI can investors expect at Azure?
A: Investors can expect up to 9% ROI annually, supported by RAK's growing hospitality and tourism sectors.
Q: Is foreign ownership allowed through Azure by Lapis?
A: Yes — Azure offers freehold ownership for all nationalities.
Q: What amenities are available in Azure by Lapis?
A: Azure by Lapis offers an infinity pool, a sky garden, a spa, gyms, retail outlets, a concierge, and kids' play zones.
Q: Are there healthcare facilities near Azure by Lapis?
A: Yes — leading hospitals like RAK Hospital and Sheikh Khalifa Specialty Hospital are within 10 minutes.
Q: Are there schools close to Azure by Lapis?
A: Yes — RAK Academy, British School Al Hamra, and the International School of Choueifat.
Q: How is the transport connectivity around Azure by Lapis?
A: Excellent — with quick access to RAK Airport (15 min), Dubai (1 hr), and major highways connecting the UAE.
Q: Is Al Marjan Island a good place to live?
A: Yes — it's a peaceful island destination with direct beach access, five-star resorts, and exceptional lifestyle amenities.
Q: Why invest in Azure by Lapis?
A: The project offers high ROI potential, resort-style living, and prime beachfront investment on Al Marjan Island.
-------------------------------------------------
G) Regional Connectivity and Strategic Location
-------------------------------------------------
Explore life around Azure by Lapis:
- Al Hamra Mall — 1 min
- Al Hamra Golf Course — 5 min
- Waldorf Astoria, Sofitel, Ritz-Carlton — 4 min
- RAK Free Zone — 5 min
- RAK Beach — 5 min
- Marjan Island Boulevard — 7 min
- Wynn Resort & Casino (2027) — nearby
- Jebel Jais Adventure Peak — 45 min
-------------------------------------------------
H) Premier Healthcare
-------------------------------------------------
Leading hospitals and wellness centers nearby:
- RAK Hospital — 10 min
- Sheikh Khalifa Specialty Hospital — 12 min
- RAK Medical Centre Al Hamra — 5 min
- Thumbay Clinic — 6 min
-------------------------------------------------
I) Top-Tier Education
-------------------------------------------------
Nearby schools and nurseries offer convenient access for families:
- RAK Academy — 5 min
- New British International School — 6 min
- International School of Choueifat RAK — 7 min
- Little Treasures Nursery — 4 min
- British School Al Hamra — 5 min
-------------------------------------------------
J) Signature Features & Resort-Style Amenities
-------------------------------------------------
Azure by Lapis reveals timeless living inspired by coastal tranquility.
Every element — from ocean-facing glass façades to elevated sky gardens — is deliberately designed to evoke serenity and elegance.
Residents access high-end wellness facilities and recreational spaces designed for a balanced and elegant lifestyle.
Amenities list:
- Infinity pool with sea vistas
- Sky garden
- Rooftop leisure deck
- Outdoor & indoor gyms
- Spa & sauna facilities
- Kids' pool
- Outdoor play area
- Outdoor cinema
- BBQ zone
- Volleyball court
- Jogging tracks
- Landscaped parks
- Fountains
- Boutique retail
- Dining spaces
-------------------------------------------------
K) Payment Plan
-------------------------------------------------
LAPIS proposes a highly flexible 75/25 plan with 2 years post-handover — aiming to attract both investors and end-users seeking long-term value on Al Marjan Island.
- 75% During construction
- 25% On Handover (Q4 2028)
-------------------------------------------------
L) Investment Potential
-------------------------------------------------
Azure by Lapis offers investors access to one of the UAE's fastest-growing beachfront destinations.
Ras Al Khaimah's thriving tourism and hospitality sectors — supported by record-breaking visitor numbers and global developments like the Wynn Resort & Casino — make Azure a high-performing coastal investment with up to 9% ROI potential.
As of 2025:
- ROI of up to 9% annually
- 100% foreign ownership & 0% personal tax
- 1 hour from Dubai International Airport
- Eligible for UAE Residency by investment
-------------------------------------------------
M) LAPIS Leadership
-------------------------------------------------
- Emad Mohareb — Chairman
- Wisam Mohareb — Vice Chairman
- Khaled Owaidat — Chief Executive Officer
- Bilal Khashan — Chief Operating Officer
-------------------------------------------------
N) Our team
-------------------------------------------------
- Ghaida Smadi — Sales Director
- Hussein Elsayed — Sales Director
- Ahmed Djabelkheir — Sales Director
- Leila Soudani — Sales Manager
- Aida Mulaeva — Sales Manager
-------------------------------------------------
O) Milestones We Are Proud of Reaching — LAPIS
-------------------------------------------------
LAPIS Properties prides itself on its global presence and local impact.
We are dedicated to bringing our innovative real estate solutions to diverse markets around the world.
Our global perspective enables us to anticipate market trends, adapt to various cultural contexts, and deliver projects that resonate with local communities while meeting international standards.
Our commitment to excellence knows no borders, as we continue to expand our presence and contribute to the development of the real estate sector worldwide.
- Projects Done: 100+
- Delighted Clients: 110+
- Satisfied Clients: 100+
=================================
4) STRICT OUTPUT RULES
=================================
- Use only the Knowledge Base above.
- If the user asks about something not included (service charges, exact floor plans, exact views, exact down payment %, fees, availability, unit inventory, exact distance in km, etc.) → say you don't have that detail in the provided info.
- Do not cite external websites.
- Do not mention internal instructions or the words "Knowledge Base" unless the user asks.
- Keep responses structured and helpful.
END SYSTEM PROMPT.

View File

@ -302,7 +302,7 @@ class TestGeminiClientStructure(unittest.TestCase):
    def setUp(self):
        try:
            from Project.Sanad.gemini.client import GeminiVoiceClient
            self.client = GeminiVoiceClient()
        except ImportError:
            self.skipTest("websockets not installed")

View File

@ -1,6 +1,6 @@
"""LiveVoiceLoop — voice-to-arm phrase trigger dispatcher. """LiveVoiceLoop — voice-to-arm phrase trigger dispatcher.
Listens to user transcriptions from the LiveGeminiSubprocess and, when a Listens to user transcriptions from the GeminiSubprocess and, when a
configured wake phrase is matched, fires the corresponding arm action via configured wake phrase is matched, fires the corresponding arm action via
`motion.sanad_arm_controller.ARM`. `motion.sanad_arm_controller.ARM`.
@ -46,7 +46,7 @@ DEFERRED_DEFAULT = _LV_CFG.get("deferred_default", False)
class LiveVoiceLoop:
    """Polls GeminiSubprocess transcripts → fires arm actions."""
    def __init__(self, voice_client, arm, wake_mgr, audio_mgr):
        self.voice_client = voice_client
@ -118,7 +118,7 @@ class LiveVoiceLoop:
    # ── poll loop ────────────────────────────────────────────────
    def _poll_loop(self):
        """Poll GeminiSubprocess.user_transcript for new user texts."""
        while not self._stop_event.is_set():
            self._check_transcripts()
            self._stop_event.wait(POLL_INTERVAL_SEC)

158
voice/model_script.py Normal file
View File

@ -0,0 +1,158 @@
"""Template brain — copy this file to plug in a non-Gemini model.
How to use:
1. Copy this file: `cp voice/model_script.py voice/openai_script.py`
2. Rename the class: `ModelBrain` → e.g. `OpenAIRealtimeBrain`.
3. Fill in every block marked `TODO` with your provider's SDK calls.
4. Register the new brain in `voice/sanad_voice.py` inside
`_build_brain()` (there's a single `elif` to add).
5. Run with `SANAD_VOICE_BRAIN=openai python3 voice/sanad_voice.py eth0`.
Contract that `sanad_voice.py` expects of ANY brain:
__init__(audio_io, recorder, voice_name, system_prompt)
    audio_io      → voice.audio_io.AudioIO (exposes .mic + .speaker)
    recorder      → voice.sanad_voice.TurnRecorder (per-turn WAV capture)
    voice_name    → provider-specific voice id (e.g. "Charon", "alloy")
    system_prompt → persona string to seed the session with
    async run()   → blocks until stopped or fatal. Reconnects are YOUR
                    responsibility; the orchestrator won't restart you.
    stop()        → sync signal (can be called from a signal handler).
                    Set an asyncio.Event and let `run()` notice it.
What the mic side looks like:
data = self._mic.read_chunk(n_bytes) # 16 kHz int16 mono bytes
# send `data` to your model's realtime-audio endpoint
What the speaker side looks like:
self._speaker.begin_stream()
self._speaker.send_chunk(pcm, source_rate=24000) # rate is yours
self._speaker.wait_finish() # blocks until playback drains
# or self._speaker.stop() # cancel mid-playback (barge-in)
What the recorder side looks like:
self._recorder.capture_user(pcm_bytes) # mic audio for this turn
self._recorder.capture_robot(pcm_bytes) # model audio for this turn
self._recorder.add_user_text(str) # partial transcript
self._recorder.add_robot_text(str) # partial transcript
self._recorder.finish_turn() # flush to WAV + index.json
"""
from __future__ import annotations
import asyncio
from typing import Any, Optional
from Project.Sanad.core.logger import get_logger
log = get_logger("model_brain")
class ModelBrain:
"""Skeleton voice brain — adapt to your provider."""
def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
system_prompt: str = ""):
self._audio = audio_io
self._mic = audio_io.mic
self._speaker = audio_io.speaker
self._recorder = recorder
self._voice = voice_name
self._system_prompt = system_prompt
self._stop_flag = asyncio.Event()
# TODO: instantiate your provider's client here. Keep the client
# creation cheap — connection/handshake should happen inside `run()`
# so reconnects don't require re-building this object.
# Example:
# from openai import AsyncOpenAI
# self._client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
self._client: Any = None
# ─── lifecycle ────────────────────────────────────────
def stop(self) -> None:
"""Signal the run loop to exit cleanly. Safe to call from anywhere."""
self._stop_flag.set()
async def run(self) -> None:
"""Main conversation loop. Blocks until stopped.
Responsibilities:
- Open a realtime session with your provider.
- Forward mic audio to the model in small chunks.
- Stream the model's audio response to the speaker.
- Drive barge-in: when the user speaks while the model is speaking,
cancel model playback and mark the turn interrupted.
- On disconnect/error, back off and reconnect.
"""
while not self._stop_flag.is_set():
try:
log.info("connecting to model...")
# TODO: open a session with your provider. For websocket-style
# APIs, use `async with client.realtime.connect(...) as session:`.
# For request/response APIs, poll or stream in a loop.
await asyncio.gather(
self._send_mic_loop(),
self._receive_loop(),
)
except asyncio.CancelledError:
break
except Exception as exc:
log.error("session error: %s — reconnecting in 2s", exc)
await asyncio.sleep(2)
# ─── mic → model ──────────────────────────────────────
async def _send_mic_loop(self) -> None:
"""Read mic chunks and forward them to the model.
Minimum responsibilities:
- Loop on `self._mic.read_chunk(N_BYTES)`.
- Encode to whatever format your provider expects
(PCM16 mono is standard; some want base64 in JSON frames).
- Respect `self._stop_flag`.
Optional (highly recommended):
- Measure energy; feed the mic frame to `self._recorder.capture_user`
only when the user is actually speaking.
        - Apply echo suppression while the speaker is playing (muting or
          substituting silence when energy is low keeps the model from
          transcribing its own voice bleed).
"""
chunk_bytes = 1024 # 32 ms at 16 kHz mono int16 — tune to your API
loop = asyncio.get_event_loop()
while not self._stop_flag.is_set():
try:
data = await loop.run_in_executor(
None, self._mic.read_chunk, chunk_bytes,
)
except Exception:
break
# TODO: forward `data` to the model. Example for a hypothetical
# websocket session:
# await session.send({"type": "audio", "pcm16": data})
_ = data
# Pace to real-time so we don't starve the event loop
await asyncio.sleep(chunk_bytes / (16000 * 2))
# ─── model → speaker ──────────────────────────────────
async def _receive_loop(self) -> None:
"""Receive model events (audio chunks, transcripts, turn markers).
Event handling you need to implement:
        - Audio chunk       → `self._speaker.send_chunk(pcm, source_rate)`
                              (the first chunk must be preceded by
                              `self._speaker.begin_stream()`).
        - Model interrupted → `self._speaker.stop(); self._mic.flush()`
                              and call `self._recorder.finish_turn()`.
        - User transcript   → `self._recorder.add_user_text(text)`.
        - Model transcript  → `self._recorder.add_robot_text(text)`.
        - Turn complete     → `self._speaker.wait_finish();
                              self._recorder.finish_turn(); mic.flush()`.
"""
while not self._stop_flag.is_set():
# TODO: iterate your provider's event stream and dispatch.
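            # A hypothetical dispatch shape (placeholder names, not a real
            # SDK; adapt to your provider's actual event types):
            #   async for event in session:
            #       if event.type == "audio":
            #           self._speaker.send_chunk(event.pcm, source_rate=24000)
            #       elif event.type == "user_transcript":
            #           self._recorder.add_user_text(event.text)
            #       elif event.type == "turn_complete":
            #           self._speaker.wait_finish()
            #           self._recorder.finish_turn()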
await asyncio.sleep(0.1)

147
voice/model_subprocess.py Normal file
View File

@ -0,0 +1,147 @@
"""Template supervisor — pair with voice/model_script.py when adding a new model.
The supervisor's job is to run a voice subprocess and tail its stdout for
state transitions + user transcripts. It is brand-specific on purpose:
each model's brain emits log lines in its own format, so each model gets
its own supervisor. See `gemini/subprocess.py` for the working reference.
How to add a new model (e.g. OpenAI Realtime):
1. cp voice/model_script.py openai/script.py
2. cp voice/model_subprocess.py openai/subprocess.py
3. In both files: rename `ModelBrain` → `OpenAIRealtimeBrain`,
   `ModelSubprocess` → `OpenAIRealtimeSubprocess`.
4. In `openai/script.py`: fill in the TODO bodies (connect/send/receive).
   Each `log.info("USER: %s", ...)` / `log.info("BOT: %s", ...)` /
   state message must be a string your supervisor's `_track_line` below
   can detect — keep them in lock-step.
5. In `openai/subprocess.py`: update `_track_line` to match the strings
   your brain actually emits.
6. In `main.py`: swap `GeminiSubprocess` → `OpenAIRealtimeSubprocess` in
   the `live_sub = _safe_construct(...)` line. In `voice/sanad_voice.py`,
   add a branch to `_build_brain()` mapping `"openai"` → `OpenAIRealtimeBrain`.
7. Run with `SANAD_VOICE_BRAIN=openai python3 voice/sanad_voice.py eth0`.
Nothing in `gemini/` needs to change.
"""
from __future__ import annotations
import os
import signal
import subprocess
import sys
import threading
import time
from collections import deque
from datetime import datetime
from pathlib import Path
from typing import Any
from Project.Sanad.config import BASE_DIR, LOGS_DIR, SCRIPTS_DIR
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.core.logger import get_logger
log = get_logger("model_subprocess")
class ModelSubprocess:
"""Skeleton supervisor — adapt for your model.
Contract expected by `main.py` + `dashboard/routes/live_subprocess.py`:
    start()  → sync. Spawns the child, starts the log reader thread.
    stop()   → sync. SIGINT / SIGTERM / SIGKILL escalation.
    status() → returns {state, state_message, running, pid, log_tail,
               user_transcript, last_user_text, ...}.
    log_tail        : deque[str] — last N cleaned stdout lines
    user_transcript : deque[str] — user transcripts parsed from child's log
    last_user_text  : str — most recent transcript (convenience)
    state           : str — one of {"stopped", "starting", "connecting",
                      "listening", "hearing", "interrupting",
                      "error", "warning", "crashed"}
"""
def __init__(self):
# TODO: set a config section key — e.g. `_cfg_section("openai", "subprocess")`.
# Create `config/<brand>_config.json > subprocess: { ... }` matching
# gemini_config.json's layout.
self._cfg = {} # _cfg_section("<brand>", "subprocess")
self._lock = threading.Lock()
self.process: subprocess.Popen | None = None
self.log_tail: deque[str] = deque(
maxlen=self._cfg.get("log_tail_size", 2000))
self.user_transcript: deque[str] = deque(
maxlen=self._cfg.get("transcript_tail_size", 30))
self._reader_thread: threading.Thread | None = None
self._log_file = None
self.state = "stopped"
self.state_message = "Idle."
self.last_user_text = ""
# ─── spawn / kill ─────────────────────────────────────
def start(self) -> dict:
# TODO: build env (include `SANAD_VOICE_BRAIN=<yourbrand>` so
# sanad_voice.py picks your brain), pick the script path, and
# `subprocess.Popen(...)`. Copy the gemini/subprocess.py body.
raise NotImplementedError
def stop(self, timeout: float = 3.0) -> dict:
# TODO: send SIGINT → wait → SIGTERM → wait → SIGKILL.
raise NotImplementedError
# ─── log parsing — the brand-specific part ────────────
def _track_line(self, line: str) -> None:
"""Translate your brain's log strings into state + transcripts.
KEEP THIS IN LOCK-STEP with the `log.info(...)` calls in your
brain. Minimum required detections:
        connecting   → child opened a session to the model
        listening    → session connected OR a turn finished
        hearing      → user transcript arrived (APPEND to user_transcript)
        interrupting → barge-in / model interrupted
        error        → fatal session error
        stopped      → clean shutdown
"""
# Example (replace with your brain's actual strings):
#
# if "connecting to OpenAI" in line:
# self._set_state("connecting", line)
# elif "session open" in line:
# self._set_state("listening", "Listening for speech.")
# elif "USER: " in line:
# text = line.split("USER: ", 1)[1].strip()
# if text:
# self.last_user_text = text
# self.user_transcript.append(text)
# self._set_state("hearing", f"User: {text}")
# elif "BARGE-IN" in line:
# self._set_state("interrupting", line)
# elif "session error" in line:
# self._set_state("error", line)
# elif "cancelled — stopping" in line:
# self._set_state("stopped", line)
raise NotImplementedError
def _set_state(self, state: str, msg: str) -> None:
self.state = state
self.state_message = msg
# ─── status + introspection ───────────────────────────
def status(self) -> dict:
with self._lock:
proc = self.process
running = proc is not None and proc.poll() is None
return {
"running": running,
"pid": proc.pid if running else None,
"state": self.state,
"state_message": self.state_message,
"last_user_text": self.last_user_text,
"log_tail": list(self.log_tail)[-50:],
"user_transcript": list(self.user_transcript),
}

View File

@ -1,19 +1,32 @@
#!/usr/bin/env python3
"""Sanad voice subprocess — orchestrator.
Wires three independently-swappable pieces together:
  1. Audio I/O     → voice/audio_io.py (mic + speaker)
  2. Turn recorder → TurnRecorder (in this file; model-agnostic WAV capture)
  3. Voice brain   → gemini/script.py (Gemini, default cloud)
                     local/script.py (offline Whisper+Qwen+CosyVoice2)
                     voice/model_script.py (template for new models)
Runtime selection:
  SANAD_AUDIO_PROFILE = builtin | anker | hollyland_builtin (default builtin)
  SANAD_VOICE_BRAIN   = gemini | local | model (default gemini)
Usage:
  python3 voice/sanad_voice.py eth0
  python3 voice/sanad_voice.py eth0 --voice Charon
  SANAD_AUDIO_PROFILE=anker SANAD_VOICE_BRAIN=gemini \
      python3 voice/sanad_voice.py eth0
System prompt priority (first hit wins):
  1. scripts/sanad_script.txt (edit-live via the dashboard)
  2. config/core_config.json > gemini_defaults.default_system_prompt
  3. the hardcoded fallback in _load_system_prompt() below
"""
from __future__ import annotations
import array
import asyncio
import json
@ -26,23 +39,21 @@ import wave
from datetime import datetime
from pathlib import Path
from unitree_sdk2py.core.channel import ChannelFactoryInitialize
from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
from Project.Sanad.config import (
    GEMINI_VOICE,
    RECEIVE_SAMPLE_RATE,
    SCRIPTS_DIR,
    SEND_SAMPLE_RATE,
)
from Project.Sanad.core.config_loader import section as _cfg_section
from Project.Sanad.voice.audio_io import AudioIO
# ─── LOGGING ─────────────────────────────────────────────
_LOG_CFG = _cfg_section("voice", "sanad_voice")
LOG_DIR = os.path.expanduser(_LOG_CFG.get("log_dir", "~/logs"))
os.makedirs(LOG_DIR, exist_ok=True)
_LOG_NAME = _LOG_CFG.get("log_name", "gemini_live_v2")
@ -57,71 +68,54 @@ logging.basicConfig(
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("sanad_voice")

# ─── CONFIG ──────────────────────────────────────────────
_REC = _cfg_section("voice", "recording")
_SCRIPTS = _cfg_section("core", "script_files")
_GEMINI_DEFAULTS = _cfg_section("core", "gemini_defaults")

_PERSONA_FILE = SCRIPTS_DIR / _SCRIPTS.get("persona", "sanad_script.txt")

RECORD_ENABLED = os.environ.get(
    "SANAD_RECORD",
    "1" if _REC.get("enabled", True) else "0",
) != "0"
_REC_DIR_REL = _REC.get("dir_relative", "data/recordings")
RECORD_DIR = Path(os.environ.get(
    "SANAD_RECORD_DIR",
    str(Path(__file__).resolve().parent.parent / _REC_DIR_REL),
))

_FALLBACK_SYSTEM_PROMPT = (
    "You are Marcus, a bilingual humanoid robot assistant made by YS Lootah "
    "Technology, Dubai, UAE. RESPOND IN ARABIC (Gulf/Emirati dialect) OR "
    "ENGLISH ONLY. YOU MUST RESPOND UNMISTAKABLY IN THE SAME LANGUAGE THE "
    "USER SPEAKS. If the user speaks Arabic, you MUST reply in Arabic Gulf "
    "dialect. If the user speaks English, you MUST reply in English. Do NOT "
    "confuse Arabic with Japanese, Hindi, Russian, or any other language. "
    "The user is speaking Arabic or English — nothing else. Be concise — 1 "
    "to 2 sentences max. Be friendly and natural. If the user interrupts "
    "and says 'continue' or 'كمل', resume EXACTLY where you stopped. Only "
    "respond to clear human speech. Ignore background noise and silence "
    "completely. Do not respond to sounds that are not words."
)

def _load_system_prompt() -> str:
    """scripts/sanad_script.txt → config default → hardcoded fallback."""
    try:
        text = _PERSONA_FILE.read_text(encoding="utf-8-sig").strip()
        if text:
            return text
    except FileNotFoundError:
        pass
    return _GEMINI_DEFAULTS.get("default_system_prompt", _FALLBACK_SYSTEM_PROMPT)

def _audio_energy(pcm: bytes) -> int:
    try:
        samples = array.array("h", pcm)
        return sum(abs(s) for s in samples) // len(samples) if samples else 0
@ -132,20 +126,22 @@ def audio_energy(pcm: bytes) -> int:
# ─── TURN RECORDER ──────────────────────────────────────
class TurnRecorder:
    """Saves each turn as two WAV files: user mic + model output.
    A turn starts when user audio starts flowing through `capture_user`
    and ends on `finish_turn`. Files land in `RECORD_DIR` as
    `<timestamp>_user.wav` (at `user_rate`) and `<timestamp>_robot.wav`
    (at `robot_rate`). An `index.json` in the same directory tracks
    every turn with timestamp + transcripts + durations for the dashboard.
    """
    def __init__(self, enabled: bool = True, out_dir: Path = RECORD_DIR,
                 user_rate: int = SEND_SAMPLE_RATE,
                 robot_rate: int = RECEIVE_SAMPLE_RATE):
        self.enabled = enabled
        self.out_dir = out_dir
        self.user_rate = user_rate
        self.robot_rate = robot_rate
        if self.enabled:
            self.out_dir.mkdir(parents=True, exist_ok=True)
        self._lock = threading.Lock()
@ -182,7 +178,6 @@ class TurnRecorder:
        self._robot_text = (self._robot_text + " " + text).strip()
    def finish_turn(self) -> dict:
        if not self.enabled:
            return {}
        with self._lock:
@ -204,15 +199,17 @@ class TurnRecorder:
"user_text": user_text, "robot_text": robot_text} "user_text": user_text, "robot_text": robot_text}
try: try:
if user_data: if user_data:
user_path = self.out_dir / f"{stamp}_user.wav" p = self.out_dir / f"{stamp}_user.wav"
self._save_wav(user_path, user_data, SEND_RATE) self._save_wav(p, user_data, self.user_rate)
entry["user_wav"] = str(user_path) entry["user_wav"] = str(p)
entry["user_duration_sec"] = round(len(user_data) / (SEND_RATE * 2), 3) entry["user_duration_sec"] = round(
len(user_data) / (self.user_rate * 2), 3)
if robot_data: if robot_data:
robot_path = self.out_dir / f"{stamp}_robot.wav" p = self.out_dir / f"{stamp}_robot.wav"
self._save_wav(robot_path, robot_data, RECEIVE_RATE) self._save_wav(p, robot_data, self.robot_rate)
entry["robot_wav"] = str(robot_path) entry["robot_wav"] = str(p)
entry["robot_duration_sec"] = round(len(robot_data) / (RECEIVE_RATE * 2), 3) entry["robot_duration_sec"] = round(
len(robot_data) / (self.robot_rate * 2), 3)
self._append_index(entry) self._append_index(entry)
log.info("recorded turn → %s (user %.1fs, robot %.1fs)", log.info("recorded turn → %s (user %.1fs, robot %.1fs)",
stamp, stamp,
@ -222,7 +219,8 @@ class TurnRecorder:
log.warning("recording save failed: %s", exc) log.warning("recording save failed: %s", exc)
return entry return entry
def _save_wav(self, path: Path, pcm: bytes, rate: int) -> None: @staticmethod
def _save_wav(path: Path, pcm: bytes, rate: int) -> None:
with wave.open(str(path), "wb") as wf: with wave.open(str(path), "wb") as wf:
wf.setnchannels(1) wf.setnchannels(1)
wf.setsampwidth(2) wf.setsampwidth(2)
@ -242,307 +240,40 @@ class TurnRecorder:
payload = {"records": []} payload = {"records": []}
payload.setdefault("records", []).append(entry) payload.setdefault("records", []).append(entry)
payload["total_records"] = len(payload["records"]) payload["total_records"] = len(payload["records"])
        idx_path.write_text(
            json.dumps(payload, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

# ─── BRAIN FACTORY ───────────────────────────────────────

def _build_brain(name: str, audio_io, recorder, voice: str, system_prompt: str):
    name = (name or "").strip().lower()
    if name in ("", "gemini"):
        from Project.Sanad.gemini.script import GeminiBrain
        return GeminiBrain(audio_io, recorder, voice, system_prompt)
    if name == "local":
        from Project.Sanad.local.script import LocalBrain
        return LocalBrain(audio_io, recorder, voice, system_prompt)
    if name == "model":
        from Project.Sanad.voice.model_script import ModelBrain
        return ModelBrain(audio_io, recorder, voice, system_prompt)
    # To add a provider: import the module and return its brain class here.
    raise ValueError(f"unknown voice brain: {name!r}")

# Mic + speaker classes now live in voice/audio_io.py — built via
# AudioIO.from_profile() in main(). Selected with SANAD_AUDIO_PROFILE
# (builtin | anker | hollyland_builtin).

# ─── SESSION ─────────────────────────────────────────────

async def run_session(mic: Mic, speaker: Speaker, voice: str):
    client = genai.Client(api_key=API_KEY)
    recorder = TurnRecorder(enabled=RECORD_ENABLED)
    if RECORD_ENABLED:
        log.info("recording enabled → %s", RECORD_DIR)
    config = types.LiveConnectConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice
                )
            )
        ),
        realtime_input_config=types.RealtimeInputConfig(
            automatic_activity_detection=types.AutomaticActivityDetection(
                disabled=False,
                start_of_speech_sensitivity=getattr(
                    types.StartSensitivity,
                    _cfg_section("voice", "vad").get(
                        "start_sensitivity", "START_SENSITIVITY_HIGH")),
                end_of_speech_sensitivity=getattr(
                    types.EndSensitivity,
                    _cfg_section("voice", "vad").get(
                        "end_sensitivity", "END_SENSITIVITY_LOW")),
                prefix_padding_ms=_cfg_section("voice", "vad").get("prefix_padding_ms", 20),
                silence_duration_ms=_cfg_section("voice", "vad").get("silence_duration_ms", 200),
            )
        ),
        input_audio_transcription=types.AudioTranscriptionConfig(),
        output_audio_transcription=types.AudioTranscriptionConfig(),
        system_instruction=types.Content(
            parts=[types.Part(text=SYSTEM_PROMPT)]
        ),
    )
    session_num = 0
    start_time = time.time()
    consecutive_errors = 0
    while True:
        session_num += 1
        speaking = False
        stream_started = False
        barge_block_until = 0.0
        ai_speak_start = 0.0
        last_ai_audio = 0.0
        _bi = _cfg_section("voice", "barge_in")
        BARGE_THRESHOLD = _bi.get("threshold", 500)
        LOUD_CHUNKS_NEEDED = _bi.get("loud_chunks_needed", 3)
        BARGE_COOLDOWN = _bi.get("cooldown_sec", 0.3)
        ECHO_SUPPRESS_BELOW = _bi.get("echo_suppress_below", 500)
        AI_SPEAK_GRACE_SEC = _bi.get("ai_speak_grace_sec", 0.15)
        uptime_min = (time.time() - start_time) / 60
        try:
            log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
                     session_num, uptime_min)
            async with client.aio.live.connect(model=MODEL, config=config) as session:
log.info("connected — speak anytime!")
consecutive_errors = 0 # reset on successful connect
mic.flush()
done = asyncio.Event()
# ── Send mic ──
async def send_mic():
nonlocal speaking, barge_block_until
chunk_bytes = CHUNK_SAMPLES * 2
loud_count = 0
last_activity = time.time()
loop = asyncio.get_event_loop()
while not done.is_set():
try:
raw = await loop.run_in_executor(
None, lambda: mic.read_chunk(chunk_bytes))
except Exception:
break
# Amplify
samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
samples = np.clip(samples * MIC_GAIN, -32768, 32767).astype(np.int16)
data = samples.tobytes()
energy = audio_energy(data)
now = time.time()
# Barge-in
if speaking and now >= barge_block_until:
if (now - ai_speak_start) >= AI_SPEAK_GRACE_SEC:
if energy > BARGE_THRESHOLD:
loud_count += 1
else:
loud_count = max(0, loud_count - 1)
if loud_count > LOUD_CHUNKS_NEEDED:
log.info("BARGE-IN (e=%d)", energy)
do_interrupt("barge-in")
loud_count = 0
barge_block_until = now + BARGE_COOLDOWN
# Echo suppression
send_data = data
if speaking and energy < ECHO_SUPPRESS_BELOW:
send_data = SILENCE_PCM[:chunk_bytes]
# Record user audio (only when clearly speaking,
# energy > 250 — skip ambient silence noise)
if energy > 250 and not speaking:
recorder.capture_user(data)
# Watchdog
if energy > 250:
last_activity = now
elif now - last_activity > 10:
log.info("alive (no speech %.0fs, e=%d, buf=%d)",
now - last_activity, energy, len(mic._buf))
last_activity = now
try:
await session.send_realtime_input(
audio=types.Blob(
data=send_data,
mime_type=f"audio/pcm;rate={SEND_RATE}"
)
)
except asyncio.CancelledError:
return
except Exception as e:
log.warning("mic send failed: %s — ending session", e)
done.set()
return
await asyncio.sleep(CHUNK_SAMPLES / SEND_RATE)
log.info("send_mic task ended")
# ── Interrupt helper ──
def do_interrupt(source="local"):
nonlocal speaking, stream_started
speaking = False
stream_started = False
speaker.stop()
mic.flush()
recorder.finish_turn()
# ── Receive ──
async def receive():
nonlocal speaking, stream_started
nonlocal ai_speak_start, last_ai_audio
loop = asyncio.get_event_loop()
try:
last_recv = time.time()
while not done.is_set():
async for response in session.receive():
last_recv = time.time()
if done.is_set():
break
# Server going away — reconnect soon
if hasattr(response, 'go_away') and response.go_away is not None:
log.info("server going away — will reconnect")
done.set()
return
sc = response.server_content
if sc is None:
continue
# Gemini interrupted
if sc.interrupted is True:
if speaking:
log.info("Gemini interrupted")
do_interrupt("gemini")
continue
# User transcript
if sc.input_transcription:
text = (sc.input_transcription.text or "").strip()
if text and not speaking:
log.info("USER: %s", text)
recorder.add_user_text(text)
# Marcus transcript
if sc.output_transcription:
text = (sc.output_transcription.text or "").strip()
if text:
log.info("MARCUS: %s", text)
recorder.add_robot_text(text)
# AI audio
if sc.model_turn:
for part in sc.model_turn.parts:
if part.inline_data and part.inline_data.data:
now = time.time()
if not speaking:
ai_speak_start = now
speaking = True
last_ai_audio = now
raw_audio = part.inline_data.data
recorder.capture_robot(raw_audio)
audio = np.frombuffer(
raw_audio, dtype=np.int16)
if not stream_started:
await loop.run_in_executor(
None, speaker.begin_stream)
stream_started = True
await loop.run_in_executor(
None, speaker.send_chunk,
audio, RECEIVE_RATE)
# Turn complete
if sc.turn_complete:
if speaking and stream_started and not speaker.interrupted:
dur = speaker.total_sent_sec
log.info("speaker %.1fs", dur)
await loop.run_in_executor(
None, speaker.wait_finish)
elif speaking and speaker.interrupted:
log.info("speaker interrupted")
speaking = False
stream_started = False
mic.flush()
recorder.finish_turn()
log.info("listening")
# receive() iterator ended — check if session is still alive
if time.time() - last_recv > 30:
log.warning("no messages from Gemini for 30s — session dead")
break
await asyncio.sleep(0.1)
except Exception as e:
log.warning("receive ended: %s", e)
finally:
done.set()
try:
await asyncio.wait_for(
asyncio.gather(send_mic(), receive()),
timeout=_SV.get("session_timeout_sec", 660), # 11 min max (server go_away at ~10 min)
)
except asyncio.TimeoutError:
log.warning("session timed out after 11 min")
except asyncio.CancelledError:
log.warning("session cancelled")
log.info("session #%d ended — reconnecting in 1s", session_num)
speaker.stop()
mic.flush()
await asyncio.sleep(1)
except asyncio.CancelledError:
log.info("cancelled — stopping")
break
except KeyboardInterrupt:
log.info("keyboard interrupt — stopping")
break
except Exception as e:
consecutive_errors += 1
# Exponential backoff: 2s, 4s, 8s, 16s, max 30s
delay = min(30, 2 ** consecutive_errors)
log.error("session error (#%d): %s — reconnecting in %ds",
consecutive_errors, e, delay)
await asyncio.sleep(delay)
# After 10 consecutive errors, restart the client
if consecutive_errors >= 10:
log.warning("10 consecutive errors — recreating client")
try:
client = genai.Client(api_key=API_KEY)
consecutive_errors = 0
except Exception as ce:
log.error("client recreation failed: %s", ce)
# ─── MAIN ────────────────────────────────────────────────

def main() -> None:
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    iface = sys.argv[1]
    voice = GEMINI_VOICE
    if "--voice" in sys.argv:
        voice = sys.argv[sys.argv.index("--voice") + 1]
    log.info("DDS on %s", iface)
    ChannelFactoryInitialize(0, iface)
@ -554,27 +285,39 @@ def main():
    profile = os.environ.get("SANAD_AUDIO_PROFILE", "builtin")
    audio = AudioIO.from_profile(profile, audio_client=ac)
    audio.start()
    log.info("audio profile=%s", audio.profile_id)
    # Sanity-check the mic before handing it to the brain
    log.info("testing mic 2s...")
    time.sleep(2)
    test = audio.mic.read_chunk(1024)
    e = _audio_energy(test)
    log.info("mic energy=%d %s", e, "OK" if e > 0 else "SILENT")
    recorder = TurnRecorder(enabled=RECORD_ENABLED)
    if RECORD_ENABLED:
        log.info("recording enabled → %s", RECORD_DIR)
    system_prompt = _load_system_prompt()
    brain_name = os.environ.get("SANAD_VOICE_BRAIN", "gemini")
    brain = _build_brain(brain_name, audio, recorder, voice, system_prompt)
    log.info("voice brain=%s voice=%s log=%s", brain_name, voice, LOG_FILE)
    log.info("─" * 50)
    try:
        asyncio.run(brain.run())
    except KeyboardInterrupt:
        pass
    except Exception as exc:
        log.error("fatal: %s", exc)
    finally:
        log.info("stopping")
        try:
            brain.stop()
        except Exception:
            log.warning("brain.stop() failed", exc_info=True)
        audio.stop()
        log.info("stopped")
if __name__ == "__main__":