Update 2026-04-20 17:59:46
This commit is contained in:
parent
71c45027f5
commit
94e4a9c4cb
34
config/gemini_config.json
Normal file
34
config/gemini_config.json
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
{
|
||||||
|
"_description": "Tunables for gemini/* modules. Loaded via core.config_loader.load('gemini'). API credentials (api_key, model, voice_name) still live in core_config.json > gemini_defaults — single source of truth shared with config.py.",
|
||||||
|
|
||||||
|
"client": {
|
||||||
|
"_comment": "gemini/client.py — short-session WebSocket client used by dashboard /generate + typed replay. default_system_prompt comes from core.gemini_defaults.",
|
||||||
|
"recv_timeout_sec": 30,
|
||||||
|
"reconnect_max_attempts": 3,
|
||||||
|
"reconnect_initial_delay_sec": 1.0,
|
||||||
|
"reconnect_max_delay_sec": 10.0
|
||||||
|
},
|
||||||
|
|
||||||
|
"subprocess": {
|
||||||
|
"_comment": "gemini/subprocess.py — GeminiSubprocess supervisor. Spawns voice/sanad_voice.py as a child, tails stdout for Gemini-specific log markers, exposes transcript + state to the dashboard.",
|
||||||
|
"log_tail_size": 2000,
|
||||||
|
"transcript_tail_size": 30,
|
||||||
|
"log_name": "gemini_subprocess",
|
||||||
|
"stop_timeout_sec": 3.0,
|
||||||
|
"terminate_timeout_sec": 2.0,
|
||||||
|
"noisy_prefixes": [
|
||||||
|
"ALSA lib ",
|
||||||
|
"Expression 'alsa_",
|
||||||
|
"Cannot connect to server socket",
|
||||||
|
"jack server is not running"
|
||||||
|
],
|
||||||
|
"noisy_fragments": [
|
||||||
|
"Unknown PCM",
|
||||||
|
"Evaluate error",
|
||||||
|
"snd_pcm_open_noupdate",
|
||||||
|
"PaAlsaStream",
|
||||||
|
"snd_config_evaluate",
|
||||||
|
"snd_func_refer"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
92
config/local_config.json
Normal file
92
config/local_config.json
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
{
|
||||||
|
"_description": "Tunables for local/* — fully on-device voice pipeline (Silero VAD → Whisper → Qwen via llama.cpp → CosyVoice2). Loaded via core.config_loader.load('local').",
|
||||||
|
|
||||||
|
"subprocess": {
|
||||||
|
"_comment": "local/subprocess.py — LocalSubprocess supervisor. Mirrors gemini/subprocess.py. IMPORTANT: python_bin points at the `local` conda env (Python 3.8 + Jetson CUDA torch) so CosyVoice+Whisper run with GPU, while the dashboard/Gemini stack stays in gemini_sdk (Python 3.10).",
|
||||||
|
"python_bin": "/home/unitree/miniconda3/envs/local/bin/python",
|
||||||
|
"log_tail_size": 2000,
|
||||||
|
"transcript_tail_size": 30,
|
||||||
|
"log_name": "local_subprocess",
|
||||||
|
"stop_timeout_sec": 5.0,
|
||||||
|
"terminate_timeout_sec": 3.0,
|
||||||
|
"noisy_prefixes": [
|
||||||
|
"ALSA lib ",
|
||||||
|
"Expression 'alsa_",
|
||||||
|
"Cannot connect to server socket",
|
||||||
|
"jack server is not running"
|
||||||
|
],
|
||||||
|
"noisy_fragments": [
|
||||||
|
"Unknown PCM",
|
||||||
|
"Evaluate error",
|
||||||
|
"snd_pcm_open_noupdate",
|
||||||
|
"PaAlsaStream"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
|
||||||
|
"vad": {
|
||||||
|
"_comment": "Silero VAD — CPU. Emits speech_start / speech_end events.",
|
||||||
|
"sample_rate": 16000,
|
||||||
|
"frame_ms": 32,
|
||||||
|
"threshold": 0.55,
|
||||||
|
"min_silence_ms": 400,
|
||||||
|
"min_speech_ms": 250,
|
||||||
|
"pad_start_ms": 200,
|
||||||
|
"pad_end_ms": 200,
|
||||||
|
"device": "cpu"
|
||||||
|
},
|
||||||
|
|
||||||
|
"stt": {
|
||||||
|
"_comment": "faster-whisper Large V3 Turbo, INT8 on GPU.",
|
||||||
|
"model_name": "large-v3-turbo",
|
||||||
|
"model_subdir": "faster-whisper-large-v3-turbo",
|
||||||
|
"device": "cuda",
|
||||||
|
"compute_type": "int8_float16",
|
||||||
|
"beam_size": 1,
|
||||||
|
"language": null,
|
||||||
|
"vad_filter": false,
|
||||||
|
"no_speech_threshold": 0.6,
|
||||||
|
"min_utterance_chars": 2,
|
||||||
|
"temperature": 0.0
|
||||||
|
},
|
||||||
|
|
||||||
|
"llm": {
|
||||||
|
"_comment": "Qwen 2.5 Instruct via Ollama (default) OR self-managed llama.cpp. Set backend to pick.",
|
||||||
|
"backend": "ollama",
|
||||||
|
|
||||||
|
"_ollama_comment": "Ollama daemon — assumes `ollama serve` is running; `ollama pull qwen2.5:1.5b` to fetch.",
|
||||||
|
"ollama_host": "127.0.0.1",
|
||||||
|
"ollama_port": 11434,
|
||||||
|
"ollama_model": "qwen2.5:1.5b",
|
||||||
|
"ollama_keep_alive": "5m",
|
||||||
|
|
||||||
|
"_llamacpp_comment": "Self-managed llama-server subprocess. Only used when backend='llama_cpp'.",
|
||||||
|
"model_subdir": "qwen2.5-1.5b-instruct-q4_k_m.gguf",
|
||||||
|
"server_binary": "llama-server",
|
||||||
|
"host": "127.0.0.1",
|
||||||
|
"port": 8080,
|
||||||
|
"n_gpu_layers": 99,
|
||||||
|
"ctx_size": 2048,
|
||||||
|
"threads": 4,
|
||||||
|
"startup_timeout_sec": 30,
|
||||||
|
|
||||||
|
"_shared_comment": "Generation params — both backends.",
|
||||||
|
"request_timeout_sec": 30,
|
||||||
|
"max_tokens": 200,
|
||||||
|
"temperature": 0.7,
|
||||||
|
"top_p": 0.9,
|
||||||
|
"stop": ["<|im_end|>", "\n\n\n"],
|
||||||
|
"chunk_delimiters": ".,?!؟،",
|
||||||
|
"chunk_min_chars": 8
|
||||||
|
},
|
||||||
|
|
||||||
|
"tts": {
|
||||||
|
"_comment": "CosyVoice2 0.5B streaming — GPU. Uses a 3s reference WAV for voice cloning.",
|
||||||
|
"model_subdir": "CosyVoice2-0.5B",
|
||||||
|
"reference_wav_subdir": "khaleeji_reference_3s.wav",
|
||||||
|
"reference_prompt": "",
|
||||||
|
"stream_chunk_sec": 0.25,
|
||||||
|
"sample_rate": 16000,
|
||||||
|
"queue_max": 3,
|
||||||
|
"device": "cuda"
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -50,39 +50,12 @@
|
|||||||
"dir_relative": "data/recordings"
|
"dir_relative": "data/recordings"
|
||||||
},
|
},
|
||||||
|
|
||||||
"system_prompt": {
|
|
||||||
"_comment": "Persona filename lives in core.script_files.persona; default text in core.gemini_defaults.default_system_prompt. This section is now metadata-only."
|
|
||||||
},
|
|
||||||
|
|
||||||
"typed_replay": {
|
"typed_replay": {
|
||||||
"_comment": "voice/typed_replay.py — max_text_len comes from dashboard.api_input",
|
"_comment": "voice/typed_replay.py — max_text_len comes from dashboard.api_input",
|
||||||
"monitor_chunk_size": 512,
|
"monitor_chunk_size": 512,
|
||||||
"monitor_tail_sec": 0.2
|
"monitor_tail_sec": 0.2
|
||||||
},
|
},
|
||||||
|
|
||||||
"live_gemini_subprocess": {
|
|
||||||
"_comment": "voice/live_gemini_subprocess.py — LiveGeminiSubprocess",
|
|
||||||
"log_tail_size": 2000,
|
|
||||||
"transcript_tail_size": 30,
|
|
||||||
"log_name": "live_gemini_subprocess",
|
|
||||||
"stop_timeout_sec": 3.0,
|
|
||||||
"terminate_timeout_sec": 2.0,
|
|
||||||
"noisy_prefixes": [
|
|
||||||
"ALSA lib ",
|
|
||||||
"Expression 'alsa_",
|
|
||||||
"Cannot connect to server socket",
|
|
||||||
"jack server is not running"
|
|
||||||
],
|
|
||||||
"noisy_fragments": [
|
|
||||||
"Unknown PCM",
|
|
||||||
"Evaluate error",
|
|
||||||
"snd_pcm_open_noupdate",
|
|
||||||
"PaAlsaStream",
|
|
||||||
"snd_config_evaluate",
|
|
||||||
"snd_func_refer"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
|
|
||||||
"live_voice_loop": {
|
"live_voice_loop": {
|
||||||
"_comment": "voice/live_voice_loop.py — arm phrase dispatcher. arm_txt filename comes from core.script_files.arm_phrases",
|
"_comment": "voice/live_voice_loop.py — arm phrase dispatcher. arm_txt filename comes from core.script_files.arm_phrases",
|
||||||
"trigger_log_size": 100,
|
"trigger_log_size": 100,
|
||||||
@ -97,27 +70,5 @@
|
|||||||
"xvector_filename": "arabic_xvector_embedding.pt",
|
"xvector_filename": "arabic_xvector_embedding.pt",
|
||||||
"sample_rate": 16000,
|
"sample_rate": 16000,
|
||||||
"channels": 1
|
"channels": 1
|
||||||
},
|
|
||||||
|
|
||||||
"gemini_client": {
|
|
||||||
"_comment": "voice/gemini_client.py — default_system_prompt comes from core.gemini_defaults",
|
|
||||||
"recv_timeout_sec": 30,
|
|
||||||
"reconnect_max_attempts": 3,
|
|
||||||
"reconnect_initial_delay_sec": 1.0,
|
|
||||||
"reconnect_max_delay_sec": 10.0
|
|
||||||
},
|
|
||||||
|
|
||||||
"asr_buffer": {
|
|
||||||
"_comment": "text_utils.maybe_trigger_arm state machine defaults",
|
|
||||||
"window_sec": 2.0,
|
|
||||||
"short_token_bonus_sec": 1.0,
|
|
||||||
"join_no_space_maxlen": 2,
|
|
||||||
"max_chars": 120,
|
|
||||||
"stream_max_chars": 80,
|
|
||||||
"trigger_dedup_window_sec": 2.0,
|
|
||||||
"pending_arm_ttl_sec": 6.0,
|
|
||||||
"pending_arm_fallback_sec": 0.65,
|
|
||||||
"dup_call_window_sec": 0.25,
|
|
||||||
"dup_asr_repeat_window_sec": 0.9
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -46,7 +46,7 @@ class Brain:
|
|||||||
self._lock = asyncio.Lock()
|
self._lock = asyncio.Lock()
|
||||||
|
|
||||||
# Sub-modules are injected after construction so imports stay lazy.
|
# Sub-modules are injected after construction so imports stay lazy.
|
||||||
self._voice = None # voice.gemini_client.GeminiVoiceClient
|
self._voice = None # gemini.client.GeminiVoiceClient
|
||||||
self._audio_mgr = None # voice.audio_manager.AudioManager
|
self._audio_mgr = None # voice.audio_manager.AudioManager
|
||||||
self._arm = None # motion.arm_controller.ArmController
|
self._arm = None # motion.arm_controller.ArmController
|
||||||
self._macro_rec = None # motion.macro_recorder.MacroRecorder
|
self._macro_rec = None # motion.macro_recorder.MacroRecorder
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
"""Live Voice Commands — voice-to-arm phrase trigger dispatcher.
|
"""Live Voice Commands — voice-to-arm phrase trigger dispatcher.
|
||||||
|
|
||||||
Listens to LiveGeminiSubprocess user transcripts, matches against
|
Listens to GeminiSubprocess user transcripts, matches against
|
||||||
sanad_arm.txt phrases, and fires ARM.trigger_action_by_id.
|
sanad_arm.txt phrases, and fires ARM.trigger_action_by_id.
|
||||||
|
|
||||||
Endpoints:
|
Endpoints:
|
||||||
|
|||||||
@ -193,7 +193,7 @@ async def update_api_key(payload: ApiKeyPayload):
|
|||||||
raise HTTPException(500, f"Could not save config: {exc}")
|
raise HTTPException(500, f"Could not save config: {exc}")
|
||||||
|
|
||||||
# Hot-swap the in-memory module globals.
|
# Hot-swap the in-memory module globals.
|
||||||
# Both Project.Sanad.config AND Project.Sanad.voice.gemini_client
|
# Both Project.Sanad.config AND Project.Sanad.gemini.client
|
||||||
# have their OWN reference to GEMINI_API_KEY (the latter was created
|
# have their OWN reference to GEMINI_API_KEY (the latter was created
|
||||||
# at `from Project.Sanad.config import GEMINI_API_KEY` at import time).
|
# at `from Project.Sanad.config import GEMINI_API_KEY` at import time).
|
||||||
# Python's `from X import Y` binds a local name — updating config.Y
|
# Python's `from X import Y` binds a local name — updating config.Y
|
||||||
@ -205,10 +205,10 @@ async def update_api_key(payload: ApiKeyPayload):
|
|||||||
log.exception("could not patch config.GEMINI_API_KEY")
|
log.exception("could not patch config.GEMINI_API_KEY")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import Project.Sanad.voice.gemini_client as _gc
|
import Project.Sanad.gemini.client as _gc
|
||||||
_gc.GEMINI_API_KEY = key
|
_gc.GEMINI_API_KEY = key
|
||||||
except Exception:
|
except Exception:
|
||||||
log.exception("could not patch gemini_client.GEMINI_API_KEY")
|
log.exception("could not patch gemini.client.GEMINI_API_KEY")
|
||||||
|
|
||||||
# Disconnect any live session so reconnect uses the new key.
|
# Disconnect any live session so reconnect uses the new key.
|
||||||
from Project.Sanad.main import voice_client
|
from Project.Sanad.main import voice_client
|
||||||
|
|||||||
@ -8,7 +8,7 @@ Usage:
|
|||||||
python3 voice_example.py gemini "hello" # one-shot Gemini text→audio
|
python3 voice_example.py gemini "hello" # one-shot Gemini text→audio
|
||||||
python3 voice_example.py local_tts "hello" # local Coqui TTS
|
python3 voice_example.py local_tts "hello" # local Coqui TTS
|
||||||
python3 voice_example.py typed_replay "hello" # typed replay engine
|
python3 voice_example.py typed_replay "hello" # typed replay engine
|
||||||
python3 voice_example.py live # spawn LiveGeminiSubprocess
|
python3 voice_example.py live # spawn GeminiSubprocess
|
||||||
python3 voice_example.py status # show status of all subsystems
|
python3 voice_example.py status # show status of all subsystems
|
||||||
|
|
||||||
Assumes Project.Sanad is importable (run from repo root or with PYTHONPATH set).
|
Assumes Project.Sanad is importable (run from repo root or with PYTHONPATH set).
|
||||||
@ -23,7 +23,7 @@ import sys
|
|||||||
|
|
||||||
def _demo_gemini(text: str) -> None:
|
def _demo_gemini(text: str) -> None:
|
||||||
"""One-shot: connect Gemini, send text, play reply."""
|
"""One-shot: connect Gemini, send text, play reply."""
|
||||||
from Project.Sanad.voice.gemini_client import GeminiVoiceClient
|
from Project.Sanad.gemini.client import GeminiVoiceClient
|
||||||
from Project.Sanad.voice.audio_manager import AudioManager
|
from Project.Sanad.voice.audio_manager import AudioManager
|
||||||
|
|
||||||
async def run():
|
async def run():
|
||||||
@ -55,7 +55,7 @@ def _demo_local_tts(text: str) -> None:
|
|||||||
|
|
||||||
def _demo_typed_replay(text: str) -> None:
|
def _demo_typed_replay(text: str) -> None:
|
||||||
"""Exercise the TypedReplayEngine end-to-end."""
|
"""Exercise the TypedReplayEngine end-to-end."""
|
||||||
from Project.Sanad.voice.gemini_client import GeminiVoiceClient
|
from Project.Sanad.gemini.client import GeminiVoiceClient
|
||||||
from Project.Sanad.voice.audio_manager import AudioManager
|
from Project.Sanad.voice.audio_manager import AudioManager
|
||||||
from Project.Sanad.voice.typed_replay import TypedReplayEngine
|
from Project.Sanad.voice.typed_replay import TypedReplayEngine
|
||||||
|
|
||||||
@ -73,9 +73,9 @@ def _demo_typed_replay(text: str) -> None:
|
|||||||
|
|
||||||
def _demo_live() -> None:
|
def _demo_live() -> None:
|
||||||
"""Spawn the live voice subprocess — same as dashboard /api/live-subprocess."""
|
"""Spawn the live voice subprocess — same as dashboard /api/live-subprocess."""
|
||||||
from Project.Sanad.voice.live_gemini_subprocess import LiveGeminiSubprocess
|
from Project.Sanad.gemini.subprocess import GeminiSubprocess
|
||||||
|
|
||||||
mgr = LiveGeminiSubprocess()
|
mgr = GeminiSubprocess()
|
||||||
info = mgr.start()
|
info = mgr.start()
|
||||||
print(f"[live] {info}")
|
print(f"[live] {info}")
|
||||||
print("Running. Ctrl+C to stop.")
|
print("Running. Ctrl+C to stop.")
|
||||||
@ -90,7 +90,7 @@ def _demo_live() -> None:
|
|||||||
|
|
||||||
def _demo_status() -> None:
|
def _demo_status() -> None:
|
||||||
"""Print status of all voice subsystems."""
|
"""Print status of all voice subsystems."""
|
||||||
from Project.Sanad.voice.gemini_client import GeminiVoiceClient
|
from Project.Sanad.gemini.client import GeminiVoiceClient
|
||||||
try:
|
try:
|
||||||
from Project.Sanad.voice.local_tts import LocalTTSEngine
|
from Project.Sanad.voice.local_tts import LocalTTSEngine
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|||||||
0
gemini/__init__.py
Normal file
0
gemini/__init__.py
Normal file
@ -30,7 +30,7 @@ from Project.Sanad.core.logger import get_logger
|
|||||||
|
|
||||||
log = get_logger("gemini_client")
|
log = get_logger("gemini_client")
|
||||||
|
|
||||||
_GC = _cfg_section("voice", "gemini_client")
|
_GC = _cfg_section("gemini", "client")
|
||||||
# Default system prompt — SINGLE SOURCE in core.gemini_defaults
|
# Default system prompt — SINGLE SOURCE in core.gemini_defaults
|
||||||
_DEFAULT_SYSTEM_PROMPT = _cfg_section("core", "gemini_defaults").get(
|
_DEFAULT_SYSTEM_PROMPT = _cfg_section("core", "gemini_defaults").get(
|
||||||
"default_system_prompt",
|
"default_system_prompt",
|
||||||
370
gemini/script.py
Normal file
370
gemini/script.py
Normal file
@ -0,0 +1,370 @@
|
|||||||
|
"""Gemini brain — live conversation loop using the google-genai SDK.
|
||||||
|
|
||||||
|
Implements the VoiceBrain contract documented in `voice/model_script.py`:
|
||||||
|
|
||||||
|
__init__(audio_io, recorder, voice_name, system_prompt)
|
||||||
|
async run()
|
||||||
|
stop()
|
||||||
|
|
||||||
|
Owns everything Gemini-specific: the `genai.Client`, `LiveConnectConfig`,
|
||||||
|
the session connect/receive loop, VAD-based barge-in, echo suppression,
|
||||||
|
reconnect backoff. Hardware I/O is delegated to `audio_io` and per-turn
|
||||||
|
WAV capture to `recorder` — both are model-agnostic.
|
||||||
|
|
||||||
|
Env overrides:
|
||||||
|
SANAD_GEMINI_MODEL — Gemini Live model id (without "models/" prefix)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import array
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from google import genai
|
||||||
|
from google.genai import types
|
||||||
|
|
||||||
|
from Project.Sanad.config import (
|
||||||
|
CHUNK_SIZE,
|
||||||
|
GEMINI_API_KEY,
|
||||||
|
GEMINI_VOICE,
|
||||||
|
RECEIVE_SAMPLE_RATE,
|
||||||
|
SEND_SAMPLE_RATE,
|
||||||
|
)
|
||||||
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
||||||
|
from Project.Sanad.core.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("gemini_brain")
|
||||||
|
|
||||||
|
_SV = _cfg_section("voice", "sanad_voice")
|
||||||
|
_VAD = _cfg_section("voice", "vad")
|
||||||
|
_BI = _cfg_section("voice", "barge_in")
|
||||||
|
|
||||||
|
_MODEL = os.environ.get(
|
||||||
|
"SANAD_GEMINI_MODEL",
|
||||||
|
"gemini-2.5-flash-native-audio-preview-12-2025",
|
||||||
|
)
|
||||||
|
_MIC_GAIN = _SV.get("mic_gain", 1.0)
|
||||||
|
_SESSION_TIMEOUT = _SV.get("session_timeout_sec", 660)
|
||||||
|
_MAX_RECONNECT_DELAY = _SV.get("max_reconnect_delay_sec", 30)
|
||||||
|
_MAX_CONSECUTIVE_ERRORS = _SV.get("max_consecutive_errors", 10)
|
||||||
|
_NO_MESSAGES_TIMEOUT = _SV.get("no_messages_timeout_sec", 30)
|
||||||
|
|
||||||
|
_CHUNK_BYTES = CHUNK_SIZE * 2
|
||||||
|
_SILENCE_PCM = b"\x00" * _CHUNK_BYTES
|
||||||
|
|
||||||
|
|
||||||
|
def _audio_energy(pcm: bytes) -> int:
|
||||||
|
try:
|
||||||
|
samples = array.array("h", pcm)
|
||||||
|
return sum(abs(s) for s in samples) // len(samples) if samples else 0
|
||||||
|
except Exception:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
class GeminiBrain:
|
||||||
|
"""Gemini Live conversation brain — reconnect-safe."""
|
||||||
|
|
||||||
|
def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
|
||||||
|
system_prompt: str = ""):
|
||||||
|
self._audio = audio_io
|
||||||
|
self._mic = audio_io.mic
|
||||||
|
self._speaker = audio_io.speaker
|
||||||
|
self._recorder = recorder
|
||||||
|
self._voice = voice_name or GEMINI_VOICE
|
||||||
|
self._system_prompt = system_prompt
|
||||||
|
self._api_key = GEMINI_API_KEY
|
||||||
|
self._stop_flag = asyncio.Event()
|
||||||
|
# per-session state (reset in the outer reconnect loop)
|
||||||
|
self._speaking = False
|
||||||
|
self._stream_started = False
|
||||||
|
self._barge_block_until = 0.0
|
||||||
|
self._ai_speak_start = 0.0
|
||||||
|
self._last_ai_audio = 0.0
|
||||||
|
self._done: Optional[asyncio.Event] = None
|
||||||
|
|
||||||
|
def stop(self) -> None:
|
||||||
|
"""Signal the run loop to exit at the next opportunity."""
|
||||||
|
try:
|
||||||
|
self._stop_flag.set()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# ─── public entry point ───────────────────────────────
|
||||||
|
|
||||||
|
async def run(self) -> None:
|
||||||
|
client = genai.Client(api_key=self._api_key)
|
||||||
|
config = self._build_config()
|
||||||
|
session_num = 0
|
||||||
|
start_time = time.time()
|
||||||
|
consecutive_errors = 0
|
||||||
|
|
||||||
|
while not self._stop_flag.is_set():
|
||||||
|
session_num += 1
|
||||||
|
self._reset_turn_state()
|
||||||
|
uptime_min = (time.time() - start_time) / 60
|
||||||
|
|
||||||
|
try:
|
||||||
|
log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
|
||||||
|
session_num, uptime_min)
|
||||||
|
async with client.aio.live.connect(model=_MODEL, config=config) as session:
|
||||||
|
log.info("connected — speak anytime!")
|
||||||
|
consecutive_errors = 0
|
||||||
|
self._mic.flush()
|
||||||
|
self._done = asyncio.Event()
|
||||||
|
|
||||||
|
try:
|
||||||
|
await asyncio.wait_for(
|
||||||
|
asyncio.gather(
|
||||||
|
self._send_mic_loop(session),
|
||||||
|
self._receive_loop(session),
|
||||||
|
),
|
||||||
|
timeout=_SESSION_TIMEOUT,
|
||||||
|
)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
log.warning("session timed out after %ds", _SESSION_TIMEOUT)
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
log.warning("session cancelled")
|
||||||
|
|
||||||
|
log.info("session #%d ended — reconnecting in 1s", session_num)
|
||||||
|
self._speaker.stop()
|
||||||
|
self._mic.flush()
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
log.info("cancelled — stopping")
|
||||||
|
break
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
log.info("keyboard interrupt — stopping")
|
||||||
|
break
|
||||||
|
except Exception as exc:
|
||||||
|
consecutive_errors += 1
|
||||||
|
delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
|
||||||
|
log.error("session error (#%d): %s — reconnecting in %ds",
|
||||||
|
consecutive_errors, exc, delay)
|
||||||
|
await asyncio.sleep(delay)
|
||||||
|
if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
|
||||||
|
log.warning("%d consecutive errors — recreating client",
|
||||||
|
consecutive_errors)
|
||||||
|
try:
|
||||||
|
client = genai.Client(api_key=self._api_key)
|
||||||
|
consecutive_errors = 0
|
||||||
|
except Exception as ce:
|
||||||
|
log.error("client recreation failed: %s", ce)
|
||||||
|
|
||||||
|
# ─── Gemini config ────────────────────────────────────
|
||||||
|
|
||||||
|
def _build_config(self) -> types.LiveConnectConfig:
|
||||||
|
return types.LiveConnectConfig(
|
||||||
|
response_modalities=["AUDIO"],
|
||||||
|
speech_config=types.SpeechConfig(
|
||||||
|
voice_config=types.VoiceConfig(
|
||||||
|
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
||||||
|
voice_name=self._voice,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
),
|
||||||
|
realtime_input_config=types.RealtimeInputConfig(
|
||||||
|
automatic_activity_detection=types.AutomaticActivityDetection(
|
||||||
|
disabled=False,
|
||||||
|
start_of_speech_sensitivity=getattr(
|
||||||
|
types.StartSensitivity,
|
||||||
|
_VAD.get("start_sensitivity", "START_SENSITIVITY_HIGH"),
|
||||||
|
),
|
||||||
|
end_of_speech_sensitivity=getattr(
|
||||||
|
types.EndSensitivity,
|
||||||
|
_VAD.get("end_sensitivity", "END_SENSITIVITY_LOW"),
|
||||||
|
),
|
||||||
|
prefix_padding_ms=_VAD.get("prefix_padding_ms", 20),
|
||||||
|
silence_duration_ms=_VAD.get("silence_duration_ms", 200),
|
||||||
|
),
|
||||||
|
),
|
||||||
|
input_audio_transcription=types.AudioTranscriptionConfig(),
|
||||||
|
output_audio_transcription=types.AudioTranscriptionConfig(),
|
||||||
|
system_instruction=types.Content(
|
||||||
|
parts=[types.Part(text=self._system_prompt)],
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# ─── state helpers ────────────────────────────────────
|
||||||
|
|
||||||
|
def _reset_turn_state(self) -> None:
|
||||||
|
self._speaking = False
|
||||||
|
self._stream_started = False
|
||||||
|
self._barge_block_until = 0.0
|
||||||
|
self._ai_speak_start = 0.0
|
||||||
|
self._last_ai_audio = 0.0
|
||||||
|
|
||||||
|
def _interrupt(self, source: str = "local") -> None:
|
||||||
|
self._speaking = False
|
||||||
|
self._stream_started = False
|
||||||
|
self._speaker.stop()
|
||||||
|
self._mic.flush()
|
||||||
|
self._recorder.finish_turn()
|
||||||
|
log.info("interrupt (%s)", source)
|
||||||
|
|
||||||
|
# ─── mic send loop ────────────────────────────────────
|
||||||
|
|
||||||
|
async def _send_mic_loop(self, session: Any) -> None:
|
||||||
|
threshold = _BI.get("threshold", 500)
|
||||||
|
chunks_needed = _BI.get("loud_chunks_needed", 3)
|
||||||
|
cooldown = _BI.get("cooldown_sec", 0.3)
|
||||||
|
echo_suppress_below = _BI.get("echo_suppress_below", 500)
|
||||||
|
grace = _BI.get("ai_speak_grace_sec", 0.15)
|
||||||
|
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
loud_count = 0
|
||||||
|
last_activity = time.time()
|
||||||
|
|
||||||
|
while not self._done.is_set() and not self._stop_flag.is_set():
|
||||||
|
try:
|
||||||
|
raw = await loop.run_in_executor(
|
||||||
|
None, self._mic.read_chunk, _CHUNK_BYTES,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
break
|
||||||
|
|
||||||
|
samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
|
||||||
|
samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
|
||||||
|
data = samples.tobytes()
|
||||||
|
energy = _audio_energy(data)
|
||||||
|
now = time.time()
|
||||||
|
|
||||||
|
# Barge-in: after AI starts speaking, sustained user energy cuts it.
|
||||||
|
if self._speaking and now >= self._barge_block_until:
|
||||||
|
if (now - self._ai_speak_start) >= grace:
|
||||||
|
if energy > threshold:
|
||||||
|
loud_count += 1
|
||||||
|
else:
|
||||||
|
loud_count = max(0, loud_count - 1)
|
||||||
|
if loud_count > chunks_needed:
|
||||||
|
log.info("BARGE-IN (e=%d)", energy)
|
||||||
|
self._interrupt("barge-in")
|
||||||
|
loud_count = 0
|
||||||
|
self._barge_block_until = now + cooldown
|
||||||
|
|
||||||
|
# Echo suppression: while AI is speaking, mask quiet frames so the
|
||||||
|
# mic doesn't feed the model its own voice bleed.
|
||||||
|
send_data = data
|
||||||
|
if self._speaking and energy < echo_suppress_below:
|
||||||
|
send_data = _SILENCE_PCM
|
||||||
|
|
||||||
|
# Record user audio when clearly speaking and AI isn't.
|
||||||
|
if energy > 250 and not self._speaking:
|
||||||
|
self._recorder.capture_user(data)
|
||||||
|
|
||||||
|
# Keep-alive watchdog
|
||||||
|
if energy > 250:
|
||||||
|
last_activity = now
|
||||||
|
elif now - last_activity > 10:
|
||||||
|
log.info("alive (no speech %.0fs, e=%d)",
|
||||||
|
now - last_activity, energy)
|
||||||
|
last_activity = now
|
||||||
|
|
||||||
|
try:
|
||||||
|
await session.send_realtime_input(
|
||||||
|
audio=types.Blob(
|
||||||
|
data=send_data,
|
||||||
|
mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
return
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("mic send failed: %s — ending session", exc)
|
||||||
|
self._done.set()
|
||||||
|
return
|
||||||
|
|
||||||
|
await asyncio.sleep(CHUNK_SIZE / SEND_SAMPLE_RATE)
|
||||||
|
|
||||||
|
log.info("send_mic task ended")
|
||||||
|
|
||||||
|
# ─── receive loop ─────────────────────────────────────
|
||||||
|
|
||||||
|
async def _receive_loop(self, session: Any) -> None:
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
try:
|
||||||
|
last_recv = time.time()
|
||||||
|
while not self._done.is_set() and not self._stop_flag.is_set():
|
||||||
|
async for response in session.receive():
|
||||||
|
last_recv = time.time()
|
||||||
|
if self._done.is_set():
|
||||||
|
break
|
||||||
|
|
||||||
|
if hasattr(response, "go_away") and response.go_away is not None:
|
||||||
|
log.info("server going away — will reconnect")
|
||||||
|
self._done.set()
|
||||||
|
return
|
||||||
|
|
||||||
|
sc = response.server_content
|
||||||
|
if sc is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if sc.interrupted is True:
|
||||||
|
if self._speaking:
|
||||||
|
log.info("Gemini interrupted")
|
||||||
|
self._interrupt("gemini")
|
||||||
|
continue
|
||||||
|
|
||||||
|
if sc.input_transcription:
|
||||||
|
text = (sc.input_transcription.text or "").strip()
|
||||||
|
if text and not self._speaking:
|
||||||
|
log.info("USER: %s", text)
|
||||||
|
self._recorder.add_user_text(text)
|
||||||
|
|
||||||
|
if sc.output_transcription:
|
||||||
|
text = (sc.output_transcription.text or "").strip()
|
||||||
|
if text:
|
||||||
|
log.info("BOT : %s", text)
|
||||||
|
self._recorder.add_robot_text(text)
|
||||||
|
|
||||||
|
if sc.model_turn:
|
||||||
|
for part in sc.model_turn.parts:
|
||||||
|
if part.inline_data and part.inline_data.data:
|
||||||
|
now = time.time()
|
||||||
|
if not self._speaking:
|
||||||
|
self._ai_speak_start = now
|
||||||
|
self._speaking = True
|
||||||
|
self._last_ai_audio = now
|
||||||
|
raw_audio = part.inline_data.data
|
||||||
|
self._recorder.capture_robot(raw_audio)
|
||||||
|
audio = np.frombuffer(raw_audio, dtype=np.int16)
|
||||||
|
if not self._stream_started:
|
||||||
|
await loop.run_in_executor(
|
||||||
|
None, self._speaker.begin_stream,
|
||||||
|
)
|
||||||
|
self._stream_started = True
|
||||||
|
await loop.run_in_executor(
|
||||||
|
None, self._speaker.send_chunk,
|
||||||
|
audio, RECEIVE_SAMPLE_RATE,
|
||||||
|
)
|
||||||
|
|
||||||
|
if sc.turn_complete:
|
||||||
|
if (self._speaking and self._stream_started
|
||||||
|
and not self._speaker.interrupted):
|
||||||
|
log.info("speaker %.1fs", self._speaker.total_sent_sec)
|
||||||
|
await loop.run_in_executor(
|
||||||
|
None, self._speaker.wait_finish,
|
||||||
|
)
|
||||||
|
elif self._speaking and self._speaker.interrupted:
|
||||||
|
log.info("speaker interrupted")
|
||||||
|
self._speaking = False
|
||||||
|
self._stream_started = False
|
||||||
|
self._mic.flush()
|
||||||
|
self._recorder.finish_turn()
|
||||||
|
log.info("listening")
|
||||||
|
|
||||||
|
if time.time() - last_recv > _NO_MESSAGES_TIMEOUT:
|
||||||
|
log.warning("no messages from Gemini for %ds — session dead",
|
||||||
|
_NO_MESSAGES_TIMEOUT)
|
||||||
|
break
|
||||||
|
await asyncio.sleep(0.1)
|
||||||
|
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("receive ended: %s", exc)
|
||||||
|
finally:
|
||||||
|
self._done.set()
|
||||||
@ -1,7 +1,11 @@
|
|||||||
"""Live Gemini Subprocess Manager — start/stop sanad_voice.py as managed child.
|
"""Gemini live subprocess supervisor.
|
||||||
|
|
||||||
Mirrors gemini_voice_v2/LiveGeminiManager. Launches the voice script as a
|
Spawns `voice/sanad_voice.py` as a managed child with `SANAD_VOICE_BRAIN=gemini`,
|
||||||
subprocess, tails stdout, parses state transitions and user transcripts.
|
tails the child's stdout, and extracts state transitions + user transcripts
|
||||||
|
from the Gemini-specific log lines emitted by `gemini/script.py:GeminiBrain`.
|
||||||
|
|
||||||
|
When a new model is added, build its own sibling supervisor (see
|
||||||
|
`voice/model_subprocess.py` for the template) — do not refactor this file.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
@ -22,9 +26,9 @@ from Project.Sanad.config import BASE_DIR, LOGS_DIR, SCRIPTS_DIR, LIVE_TUNE
|
|||||||
from Project.Sanad.core.config_loader import section as _cfg_section
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
||||||
from Project.Sanad.core.logger import get_logger
|
from Project.Sanad.core.logger import get_logger
|
||||||
|
|
||||||
log = get_logger("live_subprocess")
|
log = get_logger("gemini_subprocess")
|
||||||
|
|
||||||
_LS_CFG = _cfg_section("voice", "live_gemini_subprocess")
|
_LS_CFG = _cfg_section("gemini", "subprocess")
|
||||||
|
|
||||||
|
|
||||||
def _resolve_live_script() -> Path:
|
def _resolve_live_script() -> Path:
|
||||||
@ -51,7 +55,7 @@ TRANSCRIPT_TAIL_SIZE = _LS_CFG.get("transcript_tail_size", 30)
|
|||||||
|
|
||||||
# Persistent on-disk log for the full subprocess session.
|
# Persistent on-disk log for the full subprocess session.
|
||||||
LIVE_LOG_DIR = LOGS_DIR
|
LIVE_LOG_DIR = LOGS_DIR
|
||||||
LIVE_LOG_NAME = _LS_CFG.get("log_name", "live_gemini_subprocess")
|
LIVE_LOG_NAME = _LS_CFG.get("log_name", "gemini_subprocess")
|
||||||
|
|
||||||
_STOP_TIMEOUT_SEC = _LS_CFG.get("stop_timeout_sec", 3.0)
|
_STOP_TIMEOUT_SEC = _LS_CFG.get("stop_timeout_sec", 3.0)
|
||||||
_TERMINATE_TIMEOUT_SEC = _LS_CFG.get("terminate_timeout_sec", 2.0)
|
_TERMINATE_TIMEOUT_SEC = _LS_CFG.get("terminate_timeout_sec", 2.0)
|
||||||
@ -66,7 +70,7 @@ _NOISY_FRAGMENTS = tuple(_LS_CFG.get("noisy_fragments", [
|
|||||||
]))
|
]))
|
||||||
|
|
||||||
|
|
||||||
class LiveGeminiSubprocess:
|
class GeminiSubprocess:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._lock = threading.Lock()
|
self._lock = threading.Lock()
|
||||||
self.process: subprocess.Popen | None = None
|
self.process: subprocess.Popen | None = None
|
||||||
@ -102,23 +106,33 @@ class LiveGeminiSubprocess:
|
|||||||
self.state_message = msg
|
self.state_message = msg
|
||||||
|
|
||||||
def _track_line(self, line: str):
|
def _track_line(self, line: str):
|
||||||
if "Connecting to Gemini" in line:
|
"""Parse Gemini-specific log markers emitted by `gemini/script.py`.
|
||||||
|
|
||||||
|
Must stay in lock-step with the `log.info(...)` strings in
|
||||||
|
`GeminiBrain`. If you add a new state, add the emit in the brain
|
||||||
|
AND the matching detector here — in one PR.
|
||||||
|
"""
|
||||||
|
if "connecting to Gemini" in line:
|
||||||
self._set_state("connecting", line)
|
self._set_state("connecting", line)
|
||||||
elif "Connected! Sanad is listening" in line:
|
elif "connected — speak anytime" in line or "connected - speak anytime" in line:
|
||||||
self._set_state("listening", "Listening for speech.")
|
self._set_state("listening", "Listening for speech.")
|
||||||
elif "USER SAID:" in line:
|
elif " USER: " in line or line.strip().startswith("USER:"):
|
||||||
text = line.split("USER SAID:", 1)[1].strip()
|
# GeminiBrain emits: log.info("USER: %s", text)
|
||||||
|
text = line.split("USER:", 1)[1].strip()
|
||||||
if text:
|
if text:
|
||||||
self.last_user_text = text
|
self.last_user_text = text
|
||||||
self.user_transcript.append(text)
|
self.user_transcript.append(text)
|
||||||
self._set_state("hearing", f"User: {text}")
|
self._set_state("hearing", f"User: {text}")
|
||||||
elif "Interruption!" in line:
|
elif "BARGE-IN" in line or "Gemini interrupted" in line or "interrupt (" in line:
|
||||||
self._set_state("interrupting", line)
|
self._set_state("interrupting", line)
|
||||||
elif any(k in line for k in ("Mic Error:", "Speaker Error:", "Fatal Error:")):
|
elif "listening" in line.lower() and "no speech" not in line:
|
||||||
|
# Fires on "listening" (post-turn) — keep the state fresh.
|
||||||
|
self._set_state("listening", "Listening for speech.")
|
||||||
|
elif "session error" in line or "client recreation failed" in line:
|
||||||
self._set_state("error", line)
|
self._set_state("error", line)
|
||||||
elif "WebSocket closed." in line:
|
elif "server going away" in line or "session ended" in line or "session dead" in line:
|
||||||
self._set_state("warning", line)
|
self._set_state("warning", line)
|
||||||
elif "Ma'a Salama" in line:
|
elif "keyboard interrupt" in line or "cancelled — stopping" in line:
|
||||||
self._set_state("stopped", line)
|
self._set_state("stopped", line)
|
||||||
|
|
||||||
def _reader_loop(self):
|
def _reader_loop(self):
|
||||||
0
local/__init__.py
Normal file
0
local/__init__.py
Normal file
305
local/llm.py
Normal file
305
local/llm.py
Normal file
@ -0,0 +1,305 @@
|
|||||||
|
"""LLM layer — Qwen 2.5 Instruct via Ollama (default) or self-managed llama.cpp.
|
||||||
|
|
||||||
|
Phase 3 of the local pipeline. Two backends, selectable via
|
||||||
|
`config/local_config.json > llm.backend`:
|
||||||
|
|
||||||
|
"ollama" — talk to a running `ollama serve` daemon (default).
|
||||||
|
No subprocess management, no CUDA build. Just:
|
||||||
|
ollama pull qwen2.5:1.5b
|
||||||
|
# daemon usually auto-starts; if not: `ollama serve &`
|
||||||
|
|
||||||
|
"llama_cpp" — launch our own `llama-server` subprocess. Requires
|
||||||
|
a CUDA build of llama.cpp and a GGUF file at
|
||||||
|
`model/local/<llm.model_subdir>`.
|
||||||
|
|
||||||
|
Both backends stream tokens and chunk them on sentence delimiters so
|
||||||
|
the TTS can start synthesising before the LLM finishes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
from typing import AsyncIterator, Optional
|
||||||
|
|
||||||
|
from Project.Sanad.config import MODEL_DIR
|
||||||
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
||||||
|
from Project.Sanad.core.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("local_llm")
|
||||||
|
_CFG = _cfg_section("local", "llm")
|
||||||
|
|
||||||
|
BACKEND = (_CFG.get("backend") or "ollama").strip().lower()
|
||||||
|
|
||||||
|
# Ollama
|
||||||
|
OLLAMA_HOST = _CFG.get("ollama_host", "127.0.0.1")
|
||||||
|
OLLAMA_PORT = int(_CFG.get("ollama_port", 11434))
|
||||||
|
OLLAMA_MODEL = _CFG.get("ollama_model", "qwen2.5:1.5b")
|
||||||
|
OLLAMA_KEEP_ALIVE = _CFG.get("ollama_keep_alive", "5m")
|
||||||
|
|
||||||
|
# llama.cpp
|
||||||
|
MODEL_SUBDIR = _CFG.get("model_subdir", "qwen2.5-1.5b-instruct-q4_k_m.gguf")
|
||||||
|
SERVER_BIN = _CFG.get("server_binary", "llama-server")
|
||||||
|
HOST = _CFG.get("host", "127.0.0.1")
|
||||||
|
PORT = int(_CFG.get("port", 8080))
|
||||||
|
N_GPU_LAYERS = _CFG.get("n_gpu_layers", 99)
|
||||||
|
CTX_SIZE = _CFG.get("ctx_size", 2048)
|
||||||
|
THREADS = _CFG.get("threads", 4)
|
||||||
|
STARTUP_TIMEOUT = _CFG.get("startup_timeout_sec", 30)
|
||||||
|
|
||||||
|
# Shared generation params
|
||||||
|
REQUEST_TIMEOUT = _CFG.get("request_timeout_sec", 30)
|
||||||
|
MAX_TOKENS = _CFG.get("max_tokens", 200)
|
||||||
|
TEMPERATURE = _CFG.get("temperature", 0.7)
|
||||||
|
TOP_P = _CFG.get("top_p", 0.9)
|
||||||
|
STOP_SEQS = list(_CFG.get("stop", ["<|im_end|>"]))
|
||||||
|
CHUNK_DELIMS = _CFG.get("chunk_delimiters", ".,?!؟،")
|
||||||
|
CHUNK_MIN_CHARS = int(_CFG.get("chunk_min_chars", 8))
|
||||||
|
|
||||||
|
LOCAL_MODEL_PATH = MODEL_DIR / "local" / MODEL_SUBDIR
|
||||||
|
|
||||||
|
|
||||||
|
class LlamaServer:
|
||||||
|
"""Thin wrapper — owns subprocess (llama.cpp) or no-op (ollama)."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self._proc: Optional[subprocess.Popen] = None
|
||||||
|
|
||||||
|
# ─── lifecycle ────────────────────────────────────────
|
||||||
|
|
||||||
|
def start(self) -> None:
|
||||||
|
if BACKEND == "ollama":
|
||||||
|
self._check_ollama()
|
||||||
|
log.info("LLM backend=ollama model=%s (@ %s:%d)",
|
||||||
|
OLLAMA_MODEL, OLLAMA_HOST, OLLAMA_PORT)
|
||||||
|
return
|
||||||
|
if BACKEND == "llama_cpp":
|
||||||
|
self._start_llama_cpp()
|
||||||
|
return
|
||||||
|
raise RuntimeError(f"unknown llm.backend: {BACKEND!r}")
|
||||||
|
|
||||||
|
def stop(self) -> None:
|
||||||
|
if self._proc is None:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
self._proc.terminate()
|
||||||
|
self._proc.wait(timeout=3)
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
self._proc.kill()
|
||||||
|
self._proc.wait(timeout=2)
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("llama-server stop error: %s", exc)
|
||||||
|
self._proc = None
|
||||||
|
|
||||||
|
def alive(self) -> bool:
|
||||||
|
if BACKEND == "ollama":
|
||||||
|
return self._ping_ollama()
|
||||||
|
return self._proc is not None and self._proc.poll() is None
|
||||||
|
|
||||||
|
# ─── Ollama backend ───────────────────────────────────
|
||||||
|
|
||||||
|
def _check_ollama(self) -> None:
|
||||||
|
"""Verify the Ollama daemon is running + the model is pulled."""
|
||||||
|
import urllib.request
|
||||||
|
tags_url = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/tags"
|
||||||
|
try:
|
||||||
|
with urllib.request.urlopen(tags_url, timeout=3) as r:
|
||||||
|
body = json.loads(r.read().decode("utf-8"))
|
||||||
|
except Exception as exc:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Ollama daemon not reachable at {tags_url} — is `ollama serve` running? ({exc})"
|
||||||
|
)
|
||||||
|
models = [m.get("name", "") for m in body.get("models", [])]
|
||||||
|
if not any(OLLAMA_MODEL in m for m in models):
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Ollama model {OLLAMA_MODEL!r} not pulled. "
|
||||||
|
f"Run: `ollama pull {OLLAMA_MODEL}`. Available: {models}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def _ping_ollama(self) -> bool:
|
||||||
|
import urllib.request
|
||||||
|
try:
|
||||||
|
with urllib.request.urlopen(
|
||||||
|
f"http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/tags", timeout=1,
|
||||||
|
) as r:
|
||||||
|
return r.status == 200
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def _stream_ollama(self, user_text: str, system_prompt: str,
|
||||||
|
cancel: asyncio.Event) -> AsyncIterator[str]:
|
||||||
|
import aiohttp
|
||||||
|
url = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/generate"
|
||||||
|
payload = {
|
||||||
|
"model": OLLAMA_MODEL,
|
||||||
|
"system": system_prompt,
|
||||||
|
"prompt": user_text,
|
||||||
|
"stream": True,
|
||||||
|
"keep_alive": OLLAMA_KEEP_ALIVE,
|
||||||
|
"options": {
|
||||||
|
"num_predict": MAX_TOKENS,
|
||||||
|
"temperature": TEMPERATURE,
|
||||||
|
"top_p": TOP_P,
|
||||||
|
"stop": STOP_SEQS,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
buf = ""
|
||||||
|
async with aiohttp.ClientSession() as sess:
|
||||||
|
try:
|
||||||
|
async with sess.post(
|
||||||
|
url, json=payload,
|
||||||
|
timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)) as resp:
|
||||||
|
async for raw in resp.content:
|
||||||
|
if cancel.is_set():
|
||||||
|
log.info("LLM stream cancelled (barge-in)")
|
||||||
|
return
|
||||||
|
line = raw.decode("utf-8", errors="ignore").strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
obj = json.loads(line)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
token = obj.get("response", "")
|
||||||
|
if token:
|
||||||
|
buf += token
|
||||||
|
if len(buf) >= CHUNK_MIN_CHARS and buf[-1] in CHUNK_DELIMS:
|
||||||
|
yield buf.strip()
|
||||||
|
buf = ""
|
||||||
|
if obj.get("done"):
|
||||||
|
break
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
return
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("Ollama stream error: %s", exc)
|
||||||
|
return
|
||||||
|
if buf.strip():
|
||||||
|
yield buf.strip()
|
||||||
|
|
||||||
|
# ─── llama.cpp backend ────────────────────────────────
|
||||||
|
|
||||||
|
def _start_llama_cpp(self) -> None:
|
||||||
|
if self._proc is not None and self._proc.poll() is None:
|
||||||
|
return
|
||||||
|
if not LOCAL_MODEL_PATH.exists():
|
||||||
|
raise RuntimeError(f"LLM model not found at {LOCAL_MODEL_PATH}")
|
||||||
|
bin_path = shutil.which(SERVER_BIN) or SERVER_BIN
|
||||||
|
cmd = [
|
||||||
|
bin_path,
|
||||||
|
"-m", str(LOCAL_MODEL_PATH),
|
||||||
|
"--host", HOST,
|
||||||
|
"--port", str(PORT),
|
||||||
|
"--n-gpu-layers", str(N_GPU_LAYERS),
|
||||||
|
"--ctx-size", str(CTX_SIZE),
|
||||||
|
"--threads", str(THREADS),
|
||||||
|
"--log-disable",
|
||||||
|
]
|
||||||
|
log.info("launching llama-server: %s", " ".join(cmd))
|
||||||
|
self._proc = subprocess.Popen(
|
||||||
|
cmd,
|
||||||
|
stdout=subprocess.DEVNULL,
|
||||||
|
stderr=subprocess.PIPE,
|
||||||
|
text=True,
|
||||||
|
)
|
||||||
|
self._wait_llama_cpp_ready()
|
||||||
|
log.info("llama-server ready (pid=%d)", self._proc.pid)
|
||||||
|
|
||||||
|
def _wait_llama_cpp_ready(self) -> None:
|
||||||
|
import urllib.request
|
||||||
|
deadline = time.time() + STARTUP_TIMEOUT
|
||||||
|
url = f"http://{HOST}:{PORT}/health"
|
||||||
|
while time.time() < deadline:
|
||||||
|
if self._proc and self._proc.poll() is not None:
|
||||||
|
stderr = self._proc.stderr.read() if self._proc.stderr else ""
|
||||||
|
raise RuntimeError(
|
||||||
|
f"llama-server exited early (code={self._proc.returncode}): {stderr[:500]}"
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
with urllib.request.urlopen(url, timeout=1) as r:
|
||||||
|
if r.status == 200:
|
||||||
|
return
|
||||||
|
except Exception:
|
||||||
|
time.sleep(0.3)
|
||||||
|
raise RuntimeError(f"llama-server did not come up within {STARTUP_TIMEOUT}s")
|
||||||
|
|
||||||
|
async def _stream_llama_cpp(self, user_text: str, system_prompt: str,
|
||||||
|
cancel: asyncio.Event) -> AsyncIterator[str]:
|
||||||
|
import aiohttp
|
||||||
|
prompt = self._format_chatml_prompt(user_text, system_prompt)
|
||||||
|
payload = {
|
||||||
|
"prompt": prompt,
|
||||||
|
"stream": True,
|
||||||
|
"n_predict": MAX_TOKENS,
|
||||||
|
"temperature": TEMPERATURE,
|
||||||
|
"top_p": TOP_P,
|
||||||
|
"stop": STOP_SEQS,
|
||||||
|
"cache_prompt": True,
|
||||||
|
}
|
||||||
|
url = f"http://{HOST}:{PORT}/completion"
|
||||||
|
buf = ""
|
||||||
|
async with aiohttp.ClientSession() as sess:
|
||||||
|
try:
|
||||||
|
async with sess.post(
|
||||||
|
url, json=payload,
|
||||||
|
timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)) as resp:
|
||||||
|
async for raw in resp.content:
|
||||||
|
if cancel.is_set():
|
||||||
|
log.info("LLM stream cancelled (barge-in)")
|
||||||
|
return
|
||||||
|
line = raw.decode("utf-8", errors="ignore").strip()
|
||||||
|
if not line.startswith("data:"):
|
||||||
|
continue
|
||||||
|
line = line[len("data:"):].strip()
|
||||||
|
if not line or line == "[DONE]":
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
obj = json.loads(line)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
token = obj.get("content", "")
|
||||||
|
if not token:
|
||||||
|
if obj.get("stop"):
|
||||||
|
break
|
||||||
|
continue
|
||||||
|
buf += token
|
||||||
|
if len(buf) >= CHUNK_MIN_CHARS and buf[-1] in CHUNK_DELIMS:
|
||||||
|
yield buf.strip()
|
||||||
|
buf = ""
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
return
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("llama-server stream error: %s", exc)
|
||||||
|
return
|
||||||
|
if buf.strip():
|
||||||
|
yield buf.strip()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _format_chatml_prompt(user_text: str, system_prompt: str) -> str:
|
||||||
|
return (
|
||||||
|
f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
|
||||||
|
f"<|im_start|>user\n{user_text}<|im_end|>\n"
|
||||||
|
f"<|im_start|>assistant\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
# ─── public streaming entry point ─────────────────────
|
||||||
|
|
||||||
|
async def stream(self, user_text: str, system_prompt: str,
|
||||||
|
cancel: asyncio.Event) -> AsyncIterator[str]:
|
||||||
|
"""Yield sentence-sized text chunks as the LLM generates.
|
||||||
|
|
||||||
|
Chunk boundaries: any char in `CHUNK_DELIMS` AND buffer length
|
||||||
|
≥ `CHUNK_MIN_CHARS`. The final buffer is flushed on completion
|
||||||
|
even without a delimiter. If `cancel` is set, the request is
|
||||||
|
aborted and the generator returns.
|
||||||
|
"""
|
||||||
|
if BACKEND == "ollama":
|
||||||
|
async for chunk in self._stream_ollama(user_text, system_prompt, cancel):
|
||||||
|
yield chunk
|
||||||
|
elif BACKEND == "llama_cpp":
|
||||||
|
async for chunk in self._stream_llama_cpp(user_text, system_prompt, cancel):
|
||||||
|
yield chunk
|
||||||
|
else:
|
||||||
|
raise RuntimeError(f"unknown llm.backend: {BACKEND!r}")
|
||||||
259
local/script.py
Normal file
259
local/script.py
Normal file
@ -0,0 +1,259 @@
|
|||||||
|
"""LocalBrain — fully on-device voice pipeline.
|
||||||
|
|
||||||
|
Implements the same contract as `gemini/script.py:GeminiBrain` so
|
||||||
|
`voice/sanad_voice.py` can swap it in via `SANAD_VOICE_BRAIN=local`.
|
||||||
|
Wires together four subsystems:
|
||||||
|
|
||||||
|
Phase 1 — Silero VAD (mic → speech boundaries)
|
||||||
|
Phase 2 — faster-whisper (speech → text)
|
||||||
|
Phase 3 — llama.cpp + Qwen (text → streaming text chunks)
|
||||||
|
Phase 4 — CosyVoice2 streaming (text chunk → cloned-voice audio)
|
||||||
|
Phase 5 — barge-in (user speaks → cancel LLM + stop speaker)
|
||||||
|
Phase 6 — stability — model load fails cleanly, crashes are logged.
|
||||||
|
|
||||||
|
Async structure:
|
||||||
|
run() is the main coroutine. It spawns three tasks:
|
||||||
|
_mic_task — reads mic, VAD, Whisper, pushes user text to _llm_queue
|
||||||
|
_dialogue_task — pops user text, streams LLM tokens into _tts_queue
|
||||||
|
_tts_task — pops text chunks, synthesises, feeds the speaker
|
||||||
|
|
||||||
|
Logging contract (matched by local/subprocess.py._track_line):
|
||||||
|
"connecting to local pipeline"
|
||||||
|
"listening"
|
||||||
|
"USER: <text>"
|
||||||
|
"BOT: <text>"
|
||||||
|
"BARGE-IN (local)"
|
||||||
|
"session error: <msg>"
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
||||||
|
from Project.Sanad.core.logger import get_logger
|
||||||
|
|
||||||
|
from Project.Sanad.local.llm import LlamaServer
|
||||||
|
from Project.Sanad.local.stt import WhisperSTT
|
||||||
|
from Project.Sanad.local.tts import CosyVoiceTTS
|
||||||
|
from Project.Sanad.local.vad import SileroVAD, FRAME_SAMPLES
|
||||||
|
|
||||||
|
log = get_logger("local_brain")
|
||||||
|
|
||||||
|
_CFG_SV = _cfg_section("voice", "sanad_voice")
|
||||||
|
_CHUNK_BYTES = FRAME_SAMPLES * 2 # int16 mono
|
||||||
|
|
||||||
|
|
||||||
|
class LocalBrain:
|
||||||
|
"""Fully on-device Gemini replacement."""
|
||||||
|
|
||||||
|
def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
|
||||||
|
system_prompt: str = ""):
|
||||||
|
self._audio = audio_io
|
||||||
|
self._mic = audio_io.mic
|
||||||
|
self._speaker = audio_io.speaker
|
||||||
|
self._recorder = recorder
|
||||||
|
self._voice = voice_name
|
||||||
|
self._system_prompt = system_prompt
|
||||||
|
|
||||||
|
# subsystems — instantiated here, loaded in run()
|
||||||
|
self._vad = SileroVAD()
|
||||||
|
self._stt = WhisperSTT()
|
||||||
|
self._llm = LlamaServer()
|
||||||
|
self._tts = CosyVoiceTTS()
|
||||||
|
|
||||||
|
# pipeline queues
|
||||||
|
self._llm_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=4)
|
||||||
|
self._tts_queue: asyncio.Queue[str] = asyncio.Queue(maxsize=4)
|
||||||
|
|
||||||
|
# control flags
|
||||||
|
self._stop_flag = asyncio.Event() # full shutdown
|
||||||
|
self._interrupt = asyncio.Event() # per-turn barge-in
|
||||||
|
self._speaking = False
|
||||||
|
self._speak_start_time = 0.0
|
||||||
|
|
||||||
|
# ─── lifecycle ────────────────────────────────────────
|
||||||
|
|
||||||
|
def stop(self) -> None:
|
||||||
|
self._stop_flag.set()
|
||||||
|
self._interrupt.set()
|
||||||
|
|
||||||
|
async def run(self) -> None:
|
||||||
|
"""Main entry. Loads models, runs pipeline, handles shutdown."""
|
||||||
|
log.info("connecting to local pipeline")
|
||||||
|
try:
|
||||||
|
await asyncio.to_thread(self._vad.start)
|
||||||
|
await asyncio.to_thread(self._stt.start)
|
||||||
|
await asyncio.to_thread(self._llm.start)
|
||||||
|
await asyncio.to_thread(self._tts.start)
|
||||||
|
except Exception as exc:
|
||||||
|
log.error("session error: local pipeline startup failed — %s", exc)
|
||||||
|
return
|
||||||
|
|
||||||
|
log.info("listening")
|
||||||
|
try:
|
||||||
|
await asyncio.gather(
|
||||||
|
self._mic_task(),
|
||||||
|
self._dialogue_task(),
|
||||||
|
self._tts_task(),
|
||||||
|
)
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
log.info("cancelled — stopping")
|
||||||
|
except Exception as exc:
|
||||||
|
log.error("session error: %s", exc)
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
self._llm.stop()
|
||||||
|
except Exception:
|
||||||
|
log.warning("LlamaServer.stop failed", exc_info=True)
|
||||||
|
self._tts.stop()
|
||||||
|
self._stt.stop()
|
||||||
|
self._vad.stop()
|
||||||
|
log.info("local pipeline stopped")
|
||||||
|
|
||||||
|
# ─── barge-in ─────────────────────────────────────────
|
||||||
|
|
||||||
|
def _begin_barge_in(self) -> None:
|
||||||
|
"""Called from mic task when user starts speaking while bot is."""
|
||||||
|
if not self._speaking:
|
||||||
|
return
|
||||||
|
log.info("BARGE-IN (local)")
|
||||||
|
self._interrupt.set()
|
||||||
|
try:
|
||||||
|
self._speaker.stop()
|
||||||
|
except Exception:
|
||||||
|
log.warning("speaker.stop during barge-in failed", exc_info=True)
|
||||||
|
# drain pipelines — discard any pending LLM/TTS chunks for this turn
|
||||||
|
self._drain_queue(self._llm_queue)
|
||||||
|
self._drain_queue(self._tts_queue)
|
||||||
|
self._speaking = False
|
||||||
|
try:
|
||||||
|
self._recorder.finish_turn()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _drain_queue(q: asyncio.Queue) -> None:
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
q.get_nowait()
|
||||||
|
q.task_done()
|
||||||
|
except asyncio.QueueEmpty:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# ─── Task 1: mic → VAD → Whisper → LLM queue ──────────
|
||||||
|
|
||||||
|
async def _mic_task(self) -> None:
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
while not self._stop_flag.is_set():
|
||||||
|
try:
|
||||||
|
pcm = await loop.run_in_executor(
|
||||||
|
None, self._mic.read_chunk, _CHUNK_BYTES,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
await asyncio.sleep(0.01)
|
||||||
|
continue
|
||||||
|
|
||||||
|
event = self._vad.process(pcm)
|
||||||
|
if event == "speech_start":
|
||||||
|
# user started talking — if bot is speaking, it's a barge-in
|
||||||
|
if self._speaking:
|
||||||
|
self._begin_barge_in()
|
||||||
|
elif event == "speech_end":
|
||||||
|
utt = self._vad.collected_audio()
|
||||||
|
if not utt:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
self._recorder.capture_user(utt)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
text = await loop.run_in_executor(None, self._stt.transcribe, utt)
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
|
log.info("USER: %s", text)
|
||||||
|
try:
|
||||||
|
self._recorder.add_user_text(text)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
# wake the LLM side — drop older pending item if full (latency > throughput)
|
||||||
|
if self._llm_queue.full():
|
||||||
|
try:
|
||||||
|
self._llm_queue.get_nowait()
|
||||||
|
except asyncio.QueueEmpty:
|
||||||
|
pass
|
||||||
|
await self._llm_queue.put(text)
|
||||||
|
|
||||||
|
# ─── Task 2: LLM streaming → TTS queue ────────────────
|
||||||
|
|
||||||
|
async def _dialogue_task(self) -> None:
|
||||||
|
while not self._stop_flag.is_set():
|
||||||
|
try:
|
||||||
|
user_text = await asyncio.wait_for(
|
||||||
|
self._llm_queue.get(), timeout=0.2)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
continue
|
||||||
|
self._interrupt.clear()
|
||||||
|
full_response = []
|
||||||
|
async for chunk in self._llm.stream(
|
||||||
|
user_text, self._system_prompt, self._interrupt):
|
||||||
|
if self._interrupt.is_set():
|
||||||
|
break
|
||||||
|
full_response.append(chunk)
|
||||||
|
await self._tts_queue.put(chunk)
|
||||||
|
self._llm_queue.task_done()
|
||||||
|
if full_response and not self._interrupt.is_set():
|
||||||
|
bot_text = " ".join(full_response).strip()
|
||||||
|
if bot_text:
|
||||||
|
log.info("BOT: %s", bot_text)
|
||||||
|
try:
|
||||||
|
self._recorder.add_robot_text(bot_text)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# ─── Task 3: TTS → speaker ────────────────────────────
|
||||||
|
|
||||||
|
async def _tts_task(self) -> None:
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
while not self._stop_flag.is_set():
|
||||||
|
try:
|
||||||
|
chunk_text = await asyncio.wait_for(
|
||||||
|
self._tts_queue.get(), timeout=0.2)
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
# idle — if we've been speaking and queue drained, close stream
|
||||||
|
if self._speaking and self._llm_queue.empty() and self._tts_queue.empty():
|
||||||
|
await loop.run_in_executor(None, self._speaker.wait_finish)
|
||||||
|
self._speaking = False
|
||||||
|
log.info("listening")
|
||||||
|
try:
|
||||||
|
self._recorder.finish_turn()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
continue
|
||||||
|
if self._interrupt.is_set():
|
||||||
|
self._tts_queue.task_done()
|
||||||
|
continue
|
||||||
|
|
||||||
|
# synthesise this text chunk → stream to speaker
|
||||||
|
if not self._speaking:
|
||||||
|
await loop.run_in_executor(None, self._speaker.begin_stream)
|
||||||
|
self._speaking = True
|
||||||
|
self._speak_start_time = time.time()
|
||||||
|
|
||||||
|
try:
|
||||||
|
for pcm in self._tts.synthesize_stream(chunk_text):
|
||||||
|
if self._interrupt.is_set():
|
||||||
|
break
|
||||||
|
try:
|
||||||
|
self._recorder.capture_robot(pcm)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
await loop.run_in_executor(
|
||||||
|
None, self._speaker.send_chunk,
|
||||||
|
pcm, self._tts.output_rate,
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("TTS chunk failed: %s", exc)
|
||||||
|
finally:
|
||||||
|
self._tts_queue.task_done()
|
||||||
96
local/stt.py
Normal file
96
local/stt.py
Normal file
@ -0,0 +1,96 @@
|
|||||||
|
"""faster-whisper Large V3 Turbo — GPU INT8 transcription.
|
||||||
|
|
||||||
|
Phase 2 of the local pipeline. Given an utterance (int16 PCM bytes at
|
||||||
|
16 kHz), returns transcribed text. Short / empty / no-speech results are
|
||||||
|
filtered out per config thresholds to avoid firing phantom triggers.
|
||||||
|
|
||||||
|
Install (on the robot, in the `local` env):
|
||||||
|
pip install faster-whisper==1.0.*
|
||||||
|
# model auto-downloads from HuggingFace on first `WhisperModel(...)` call,
|
||||||
|
# OR pre-download to model/local/faster-whisper-large-v3-turbo/ and point
|
||||||
|
# `local.stt.model_subdir` at it.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from Project.Sanad.config import MODEL_DIR
|
||||||
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
||||||
|
from Project.Sanad.core.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("local_stt")
|
||||||
|
_CFG = _cfg_section("local", "stt")
|
||||||
|
|
||||||
|
MODEL_NAME = _CFG.get("model_name", "large-v3-turbo")
|
||||||
|
MODEL_SUBDIR = _CFG.get("model_subdir", "faster-whisper-large-v3-turbo")
|
||||||
|
DEVICE = _CFG.get("device", "cuda")
|
||||||
|
COMPUTE_TYPE = _CFG.get("compute_type", "int8_float16")
|
||||||
|
BEAM_SIZE = _CFG.get("beam_size", 1)
|
||||||
|
LANGUAGE = _CFG.get("language") # None = auto-detect
|
||||||
|
VAD_FILTER = _CFG.get("vad_filter", False)
|
||||||
|
NO_SPEECH_THRESHOLD = _CFG.get("no_speech_threshold", 0.6)
|
||||||
|
MIN_CHARS = _CFG.get("min_utterance_chars", 2)
|
||||||
|
TEMPERATURE = _CFG.get("temperature", 0.0)
|
||||||
|
|
||||||
|
LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR
|
||||||
|
|
||||||
|
|
||||||
|
class WhisperSTT:
|
||||||
|
"""Thin wrapper around faster_whisper.WhisperModel."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self._model = None
|
||||||
|
|
||||||
|
def start(self) -> None:
|
||||||
|
"""Load the model into VRAM. ~4 s on first call, 100 ms after."""
|
||||||
|
try:
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
except ImportError as exc:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"WhisperSTT requires 'faster-whisper': {exc}"
|
||||||
|
)
|
||||||
|
|
||||||
|
model_src = str(LOCAL_MODEL_DIR) if LOCAL_MODEL_DIR.exists() else MODEL_NAME
|
||||||
|
log.info("loading Whisper: src=%s device=%s compute=%s",
|
||||||
|
model_src, DEVICE, COMPUTE_TYPE)
|
||||||
|
self._model = WhisperModel(
|
||||||
|
model_src,
|
||||||
|
device=DEVICE,
|
||||||
|
compute_type=COMPUTE_TYPE,
|
||||||
|
)
|
||||||
|
log.info("WhisperSTT ready")
|
||||||
|
|
||||||
|
def transcribe(self, pcm: bytes) -> str:
|
||||||
|
"""Blocking transcription. Returns the full text or ''."""
|
||||||
|
if self._model is None:
|
||||||
|
log.warning("WhisperSTT.transcribe called before start()")
|
||||||
|
return ""
|
||||||
|
if not pcm:
|
||||||
|
return ""
|
||||||
|
audio = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
|
||||||
|
if audio.size == 0:
|
||||||
|
return ""
|
||||||
|
try:
|
||||||
|
segments, info = self._model.transcribe(
|
||||||
|
audio,
|
||||||
|
beam_size=BEAM_SIZE,
|
||||||
|
language=LANGUAGE,
|
||||||
|
vad_filter=VAD_FILTER,
|
||||||
|
no_speech_threshold=NO_SPEECH_THRESHOLD,
|
||||||
|
temperature=TEMPERATURE,
|
||||||
|
)
|
||||||
|
text = " ".join(seg.text.strip() for seg in segments).strip()
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("Whisper transcribe failed: %s", exc)
|
||||||
|
return ""
|
||||||
|
|
||||||
|
if len(text) < MIN_CHARS:
|
||||||
|
log.debug("drop short transcript: %r", text)
|
||||||
|
return ""
|
||||||
|
return text
|
||||||
|
|
||||||
|
def stop(self) -> None:
|
||||||
|
self._model = None
|
||||||
261
local/subprocess.py
Normal file
261
local/subprocess.py
Normal file
@ -0,0 +1,261 @@
|
|||||||
|
"""Local live subprocess supervisor.
|
||||||
|
|
||||||
|
Spawns `voice/sanad_voice.py` as a managed child with
|
||||||
|
`SANAD_VOICE_BRAIN=local`, tails the child's stdout, and extracts state
|
||||||
|
transitions + user transcripts from the log markers emitted by
|
||||||
|
`local/script.py:LocalBrain`.
|
||||||
|
|
||||||
|
Mirror of `gemini/subprocess.py`. Lives separately so the two supervisors
|
||||||
|
stay decoupled — adding a new model does not touch this file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
from collections import deque
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from Project.Sanad.config import BASE_DIR, LOGS_DIR, SCRIPTS_DIR, LIVE_TUNE
|
||||||
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
||||||
|
from Project.Sanad.core.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("local_subprocess")
|
||||||
|
|
||||||
|
_LS_CFG = _cfg_section("local", "subprocess")
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_live_script() -> Path:
|
||||||
|
"""Locate the voice script to run as subprocess (same as Gemini's)."""
|
||||||
|
override = os.environ.get("SANAD_LIVE_SCRIPT", "").strip()
|
||||||
|
if override:
|
||||||
|
p = Path(override).expanduser()
|
||||||
|
if p.exists():
|
||||||
|
return p
|
||||||
|
for c in (BASE_DIR / "voice" / "sanad_voice.py",
|
||||||
|
SCRIPTS_DIR / "sanad_voice.py"):
|
||||||
|
if c.exists():
|
||||||
|
return c
|
||||||
|
return SCRIPTS_DIR / "sanad_voice.py"
|
||||||
|
|
||||||
|
|
||||||
|
LIVE_SCRIPT = _resolve_live_script()
|
||||||
|
LOG_TAIL_SIZE = _LS_CFG.get("log_tail_size", 2000)
|
||||||
|
TRANSCRIPT_TAIL_SIZE = _LS_CFG.get("transcript_tail_size", 30)
|
||||||
|
LIVE_LOG_DIR = LOGS_DIR
|
||||||
|
LIVE_LOG_NAME = _LS_CFG.get("log_name", "local_subprocess")
|
||||||
|
|
||||||
|
# Python binary for the child process. The local pipeline runs in a
|
||||||
|
# separate conda env (Python 3.8 + Jetson CUDA torch + CosyVoice/Whisper);
|
||||||
|
# the dashboard stays in gemini_sdk (Python 3.10). Override with
|
||||||
|
# SANAD_LOCAL_PYTHON env var at runtime.
|
||||||
|
LOCAL_PYTHON_BIN = os.environ.get(
|
||||||
|
"SANAD_LOCAL_PYTHON",
|
||||||
|
_LS_CFG.get("python_bin", sys.executable),
|
||||||
|
)
|
||||||
|
|
||||||
|
_STOP_TIMEOUT_SEC = _LS_CFG.get("stop_timeout_sec", 5.0)
|
||||||
|
_TERMINATE_TIMEOUT_SEC = _LS_CFG.get("terminate_timeout_sec", 3.0)
|
||||||
|
|
||||||
|
_NOISY_PREFIXES = tuple(_LS_CFG.get("noisy_prefixes", [
|
||||||
|
"ALSA lib ", "Expression 'alsa_", "Cannot connect to server socket",
|
||||||
|
"jack server is not running",
|
||||||
|
]))
|
||||||
|
_NOISY_FRAGMENTS = tuple(_LS_CFG.get("noisy_fragments", [
|
||||||
|
"Unknown PCM", "Evaluate error", "snd_pcm_open_noupdate", "PaAlsaStream",
|
||||||
|
]))
|
||||||
|
|
||||||
|
|
||||||
|
class LocalSubprocess:
    """Supervisor for the local-pipeline child process.

    Spawns the voice script (with SANAD_VOICE_BRAIN=local) under a
    dedicated Python interpreter, tails its merged stdout/stderr for
    the log markers emitted by `local/script.py:LocalBrain`, and
    exposes derived state + transcript to the dashboard. Interface
    mirrors the Gemini subprocess supervisor.
    """

    def __init__(self):
        # Guards process / log_tail / state mutations shared between the
        # reader thread and the dashboard's start/stop/status calls.
        self._lock = threading.Lock()
        self.process: subprocess.Popen | None = None
        # Bounded ring buffers surfaced in the dashboard UI.
        self.log_tail: deque[str] = deque(maxlen=LOG_TAIL_SIZE)
        self.user_transcript: deque[str] = deque(maxlen=TRANSCRIPT_TAIL_SIZE)
        self._reader_thread: threading.Thread | None = None
        self._log_file = None  # per-session on-disk log handle (or None)
        # Coarse lifecycle state derived from child log lines.
        self.state = "stopped"
        self.state_message = "Idle."
        self.last_user_text = ""
        self.suppressed_noise = 0  # count of ALSA/JACK noise lines dropped

    # ─── log I/O ──────────────────────────────────────────

    def _open_session_log(self, pid: int):
        """Open (append mode) today's on-disk session log.

        Returns the file handle, or None when the log cannot be opened —
        disk logging is best-effort and supervision continues without it.
        """
        try:
            LIVE_LOG_DIR.mkdir(parents=True, exist_ok=True)
            fname = f"{LIVE_LOG_NAME}_{datetime.now().strftime('%Y%m%d')}.log"
            # buffering=1 → line-buffered so `tail -f` sees lines promptly.
            fh = open(LIVE_LOG_DIR / fname, "a", encoding="utf-8", buffering=1)
            fh.write(
                f"\n===== local subprocess start "
                f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} pid={pid} =====\n"
            )
            return fh
        except Exception as exc:
            log.warning("Could not open local subprocess log file: %s", exc)
            return None

    def _is_noisy(self, line: str) -> bool:
        """True for known ALSA/JACK chatter that should not reach the UI."""
        return line.startswith(_NOISY_PREFIXES) or any(f in line for f in _NOISY_FRAGMENTS)

    def _set_state(self, state: str, msg: str):
        # NOTE(review): callers are expected to hold self._lock (or run
        # before the reader thread starts) — confirm all call sites.
        self.state = state
        self.state_message = msg

    def _track_line(self, line: str):
        """Parse log markers emitted by `local/script.py:LocalBrain`.

        Must stay in lock-step with the `log.info(...)` strings there.
        """
        if "connecting to local pipeline" in line:
            self._set_state("connecting", line)
        elif " USER: " in line or line.strip().startswith("USER:"):
            text = line.split("USER:", 1)[1].strip()
            if text:
                self.last_user_text = text
                self.user_transcript.append(text)
                self._set_state("hearing", f"User: {text}")
        elif " BOT: " in line or line.strip().startswith("BOT:"):
            # Show at most 80 chars of the bot reply in the state banner.
            self._set_state("speaking", line.split("BOT:", 1)[1].strip()[:80])
        elif "BARGE-IN (local)" in line:
            self._set_state("interrupting", line)
        elif "session error" in line:
            self._set_state("error", line)
        elif "local pipeline stopped" in line or "cancelled — stopping" in line:
            self._set_state("stopped", line)
        elif "listening" in line.lower() and "no speech" not in line:
            self._set_state("listening", "Listening for speech.")

    def _reader_loop(self):
        """Daemon thread body: drain child stdout until EOF, then mark stopped."""
        proc = self.process
        if proc is None or proc.stdout is None:
            return
        fh = self._open_session_log(proc.pid)
        self._log_file = fh
        for line in proc.stdout:
            clean = line.rstrip()
            if not clean:
                continue
            # Every (non-empty) line goes to the on-disk session log,
            # including the noise the dashboard tail filters out.
            if fh is not None:
                try:
                    fh.write(clean + "\n")
                except Exception:
                    pass
            with self._lock:
                if self._is_noisy(clean):
                    self.suppressed_noise += 1
                    continue
                self.log_tail.append(clean)
                self._track_line(clean)
        # stdout EOF → the child exited (or closed its pipe).
        with self._lock:
            self.log_tail.append("Local pipeline process exited.")
            self._set_state("stopped", "Process exited.")
        if fh is not None:
            try:
                fh.write(
                    f"===== local subprocess exit "
                    f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====\n"
                )
                fh.close()
            except Exception:
                pass
        self._log_file = None

    # ─── lifecycle ────────────────────────────────────────

    def is_running(self) -> bool:
        """True while the child process exists and has not exited."""
        with self._lock:
            return self.process is not None and self.process.poll() is None

    def start(self) -> dict[str, Any]:
        """Spawn the child pipeline and begin tailing its output.

        Returns {"started": bool, "pid": int, ...}; no-op when already
        running. Raises RuntimeError when the voice script is missing.
        """
        with self._lock:
            if self.process is not None and self.process.poll() is None:
                return {"started": False, "message": "Already running.", "pid": self.process.pid}
            self._set_state("starting", "Starting local pipeline (loading models)...")

        script = LIVE_SCRIPT
        if not script.exists():
            raise RuntimeError(f"Script not found: {script}")

        env = os.environ.copy()
        env.update({
            "PYTHONUNBUFFERED": "1",  # so the reader sees lines immediately
            **LIVE_TUNE,
            "SANAD_VOICE_BRAIN": "local",  # select the local brain inside the child
        })

        dds_iface = env.get("SANAD_DDS_INTERFACE", "eth0")
        # Use the `local` env's Python so CUDA torch + CosyVoice are available.
        # Fall back to sys.executable only if the configured bin doesn't exist.
        py_bin = LOCAL_PYTHON_BIN
        if not Path(py_bin).exists():
            log.warning("LOCAL_PYTHON_BIN=%s not found, falling back to %s",
                        py_bin, sys.executable)
            py_bin = sys.executable
        cmd = [py_bin, str(script), dds_iface]
        proc = subprocess.Popen(
            cmd,
            cwd=str(script.parent),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr into the tailed stream
            text=True,
            bufsize=1,  # line-buffered
            env=env,
        )

        with self._lock:
            self.process = proc
            self.log_tail.append(f"Started: pid={proc.pid}")
            self._set_state("starting", f"pid={proc.pid}")
            self._reader_thread = threading.Thread(target=self._reader_loop, daemon=True)
            self._reader_thread.start()

        log.info("Local subprocess started: pid=%d", proc.pid)
        return {"started": True, "pid": proc.pid}

    def stop(self) -> dict[str, Any]:
        """Stop the child: SIGINT → terminate → kill escalation.

        Returns {"stopped": bool, ...} with the child's return code.
        """
        with self._lock:
            proc = self.process
            if proc is None or proc.poll() is not None:
                return {"stopped": False, "message": "Not running."}
            self._set_state("stopping", "Stopping...")

        try:
            # Graceful first: give the child a chance to shut down cleanly.
            proc.send_signal(signal.SIGINT)
            proc.wait(timeout=_STOP_TIMEOUT_SEC)
        except subprocess.TimeoutExpired:
            proc.terminate()
            try:
                proc.wait(timeout=_TERMINATE_TIMEOUT_SEC)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait(timeout=_TERMINATE_TIMEOUT_SEC)

        rc = proc.returncode

        with self._lock:
            self.process = None
            self.log_tail.append("Stopped.")
            self._set_state("stopped", "Stopped.")

        log.info("Local subprocess stopped (rc=%s)", rc)
        return {"stopped": True, "returncode": rc}

    def status(self) -> dict[str, Any]:
        """Snapshot of supervisor state for the dashboard status endpoint."""
        with self._lock:
            running = self.process is not None and self.process.poll() is None
            return {
                "running": running,
                "pid": self.process.pid if running and self.process else None,
                "state": self.state,
                "state_message": self.state_message,
                "last_user_text": self.last_user_text,
                "user_transcript": list(self.user_transcript),
                "log_tail": list(self.log_tail),
                "suppressed_noise": self.suppressed_noise,
            }
|
||||||
126
local/tts.py
Normal file
126
local/tts.py
Normal file
@ -0,0 +1,126 @@
|
|||||||
|
"""CosyVoice2 0.5B streaming TTS — GPU.
|
||||||
|
|
||||||
|
Phase 4 of the local pipeline. Holds a 3-second reference WAV in VRAM
|
||||||
|
and synthesises streaming Arabic/English audio for every text chunk
|
||||||
|
arriving from the LLM. Emits int16 PCM at the model's native rate
|
||||||
|
(CosyVoice2 outputs 22 050 Hz — we resample to `sample_rate` from
|
||||||
|
config so the downstream `audio_io.speaker` gets a consistent rate).
|
||||||
|
|
||||||
|
Install (on the robot):
|
||||||
|
cd ~/src
|
||||||
|
git clone --recursive https://github.com/FunAudioLLM/CosyVoice
|
||||||
|
cd CosyVoice
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install -e .
|
||||||
|
|
||||||
|
# model + reference voice
|
||||||
|
huggingface-cli download FunAudioLLM/CosyVoice2-0.5B \\
|
||||||
|
--local-dir ~/sanad/model/local/CosyVoice2-0.5B
|
||||||
|
# place a 3-s Khaleeji clip at model/local/khaleeji_reference_3s.wav
|
||||||
|
# (16 kHz mono int16 WAV)
|
||||||
|
|
||||||
|
API note:
|
||||||
|
CosyVoice2 is evolving. We use the published `inference_zero_shot`
|
||||||
|
with `stream=True` which yields `{"tts_speech": tensor}` chunks.
|
||||||
|
If the upstream API renames, adapt in one place — `TtsEngine._stream`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import AsyncIterator, Iterator, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from Project.Sanad.config import MODEL_DIR
|
||||||
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
||||||
|
from Project.Sanad.core.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("local_tts")
_CFG = _cfg_section("local", "tts")

# Tunables from config/local_config.json > tts.
MODEL_SUBDIR = _CFG.get("model_subdir", "CosyVoice2-0.5B")
REFERENCE_WAV_SUBDIR = _CFG.get("reference_wav_subdir", "khaleeji_reference_3s.wav")
REFERENCE_PROMPT = _CFG.get("reference_prompt", "")
OUT_RATE = int(_CFG.get("sample_rate", 16000))  # rate delivered downstream
QUEUE_MAX = int(_CFG.get("queue_max", 3))
DEVICE = _CFG.get("device", "cuda")

# On-disk locations under MODEL_DIR/local/.
LOCAL_MODEL_DIR = MODEL_DIR / "local" / MODEL_SUBDIR
REFERENCE_WAV_PATH = MODEL_DIR / "local" / REFERENCE_WAV_SUBDIR
|
||||||
|
|
||||||
|
|
||||||
|
def _resample_int16(pcm: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
|
||||||
|
if src_rate == dst_rate or pcm.size == 0:
|
||||||
|
return pcm.astype(np.int16, copy=False)
|
||||||
|
target_len = max(1, int(len(pcm) * dst_rate / src_rate))
|
||||||
|
return np.interp(
|
||||||
|
np.linspace(0, len(pcm), target_len, endpoint=False),
|
||||||
|
np.arange(len(pcm)),
|
||||||
|
pcm.astype(np.float64),
|
||||||
|
).astype(np.int16)
|
||||||
|
|
||||||
|
|
||||||
|
class CosyVoiceTTS:
    """Thin async wrapper around CosyVoice2 streaming inference."""

    def __init__(self) -> None:
        self._model = None  # CosyVoice2 instance, populated by start()
        self._ref_speech = None  # preloaded reference tensor
        self._ref_prompt = REFERENCE_PROMPT
        # Model's native output rate; corrected from the instance in start().
        self._model_rate: int = 22050

    def start(self) -> None:
        """Load the model + reference voice.

        Raises RuntimeError when the CosyVoice package, the model
        directory, or the reference WAV is missing.
        """
        try:
            from cosyvoice.cli.cosyvoice import CosyVoice2
            from cosyvoice.utils.file_utils import load_wav
        except ImportError as exc:
            raise RuntimeError(
                f"CosyVoiceTTS requires the CosyVoice package from source: {exc}"
            )
        if not LOCAL_MODEL_DIR.exists():
            raise RuntimeError(f"CosyVoice2 model not found at {LOCAL_MODEL_DIR}")
        if not REFERENCE_WAV_PATH.exists():
            raise RuntimeError(
                f"Reference voice WAV not found at {REFERENCE_WAV_PATH}"
            )
        log.info("loading CosyVoice2: %s", LOCAL_MODEL_DIR)
        self._model = CosyVoice2(str(LOCAL_MODEL_DIR), load_jit=True, fp16=True)
        # model.sample_rate is an instance attr on CosyVoice2
        self._model_rate = getattr(self._model, "sample_rate", 22050)
        self._ref_speech = load_wav(str(REFERENCE_WAV_PATH), 16000)
        log.info("CosyVoiceTTS ready (model_rate=%d)", self._model_rate)

    def synthesize_stream(self, text: str) -> Iterator[bytes]:
        """Yield int16 PCM bytes at OUT_RATE, one streaming chunk at a time."""
        # Silently yields nothing before start() (or after stop()).
        if self._model is None or self._ref_speech is None:
            return
        try:
            # CosyVoice2 streaming generator. Each step yields a tensor
            # of float32 waveform samples at the model's native rate.
            for step in self._model.inference_zero_shot(
                    text,
                    self._ref_prompt,
                    self._ref_speech,
                    stream=True):
                wave = step.get("tts_speech")
                if wave is None:
                    continue
                # tensor → float32 numpy → int16 at OUT_RATE
                arr = wave.cpu().numpy().squeeze()
                if arr.size == 0:
                    continue
                pcm_i16 = np.clip(arr * 32767.0, -32768, 32767).astype(np.int16)
                if self._model_rate != OUT_RATE:
                    pcm_i16 = _resample_int16(pcm_i16, self._model_rate, OUT_RATE)
                yield pcm_i16.tobytes()
        except Exception as exc:
            # Drop the failed chunk but keep the pipeline alive.
            log.warning("TTS synth failed for chunk %r: %s", text[:40], exc)

    def stop(self) -> None:
        """Drop model + reference so GPU memory can be reclaimed."""
        self._model = None
        self._ref_speech = None

    @property
    def output_rate(self) -> int:
        # Rate of the PCM emitted by synthesize_stream (post-resample).
        return OUT_RATE
|
||||||
150
local/vad.py
Normal file
150
local/vad.py
Normal file
@ -0,0 +1,150 @@
|
|||||||
|
"""Silero VAD wrapper — CPU-only speech boundary detection.
|
||||||
|
|
||||||
|
Phase 1 of the local pipeline. Consumes 16 kHz mono int16 PCM in short
|
||||||
|
frames, emits speech_start / speech_end events. All thresholds + frame
|
||||||
|
sizes come from config/local_config.json > vad.
|
||||||
|
|
||||||
|
Install (on the robot):
|
||||||
|
pip install silero-vad torch==2.2.* torchaudio==2.2.*
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
vad = SileroVAD()
|
||||||
|
vad.start()
|
||||||
|
evt = vad.process(pcm_bytes)
|
||||||
|
if evt == 'speech_start': ...
|
||||||
|
elif evt == 'speech_end': buf = vad.collected_audio()
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
||||||
|
from Project.Sanad.core.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("local_vad")
_CFG = _cfg_section("local", "vad")

# Tunables from config/local_config.json > vad.
SAMPLE_RATE = _CFG.get("sample_rate", 16000)
FRAME_MS = _CFG.get("frame_ms", 32)
THRESHOLD = _CFG.get("threshold", 0.55)  # speech-probability cut-off
MIN_SILENCE_MS = _CFG.get("min_silence_ms", 400)  # silence that ends an utterance
MIN_SPEECH_MS = _CFG.get("min_speech_ms", 250)  # shorter utterances are dropped
PAD_START_MS = _CFG.get("pad_start_ms", 200)  # lead-in kept before speech_start
PAD_END_MS = _CFG.get("pad_end_ms", 200)

FRAME_SAMPLES = SAMPLE_RATE * FRAME_MS // 1000  # 512 @ 16k/32ms
||||||
|
|
||||||
|
class SileroVAD:
    """Streaming VAD with buffered utterance capture.

    Fed one mic frame at a time via `process()`. Internal state tracks
    whether we're inside an utterance; on speech_end, `collected_audio()`
    returns the full utterance (with configured padding).
    """

    def __init__(self) -> None:
        self._model = None  # Silero model, loaded by start()
        self._audio_buf: list[bytes] = []  # utterance being collected
        self._pre_buf: list[bytes] = []  # rolling "pre-speech" ring
        # Frame counts derived from the ms-based padding config.
        self._pre_frames = max(1, PAD_START_MS // FRAME_MS)
        self._pad_end_frames = max(1, PAD_END_MS // FRAME_MS)
        self._in_speech = False
        self._last_speech_time = 0.0
        self._speech_start_time = 0.0
        self._trailing_silence_frames = 0
        self._last_utterance: Optional[bytes] = None

    def start(self) -> None:
        """Load the Silero model once. Call before `process()`.

        Raises RuntimeError when silero-vad / torch are not installed.
        """
        try:
            import torch
            from silero_vad import load_silero_vad
        except ImportError as exc:
            raise RuntimeError(
                f"SileroVAD requires 'silero-vad' + torch: {exc}"
            )
        self._model = load_silero_vad()
        log.info("SileroVAD ready (threshold=%.2f, frame=%dms)",
                 THRESHOLD, FRAME_MS)

    def process(self, pcm: bytes) -> Optional[str]:
        """Feed one frame (≈ FRAME_MS of audio). Returns an event or None.

        Events: 'speech_start' | 'speech_end' | None
        """
        # No-op before start() (or after stop()).
        if self._model is None:
            return None
        # keep a rolling pre-buffer so captured utterances include lead-in
        self._pre_buf.append(pcm)
        if len(self._pre_buf) > self._pre_frames:
            self._pre_buf.pop(0)

        # VAD expects float32 in [-1, 1]
        arr = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
        if arr.size < FRAME_SAMPLES:
            # pad if short tail chunk arrived
            arr = np.concatenate([arr, np.zeros(FRAME_SAMPLES - arr.size, dtype=np.float32)])
        elif arr.size > FRAME_SAMPLES:
            arr = arr[:FRAME_SAMPLES]

        try:
            import torch
            with torch.no_grad():
                prob = float(self._model(torch.from_numpy(arr), SAMPLE_RATE).item())
        except Exception as exc:
            log.warning("VAD inference failed: %s", exc)
            return None

        now = time.time()
        is_speech = prob >= THRESHOLD

        if is_speech:
            self._trailing_silence_frames = 0
            self._last_speech_time = now
            if not self._in_speech:
                # transition → speech
                self._in_speech = True
                self._speech_start_time = now
                self._audio_buf = list(self._pre_buf)  # seed with pad
                self._audio_buf.append(pcm)
                return "speech_start"
            self._audio_buf.append(pcm)
            return None

        # silent frame
        if self._in_speech:
            self._audio_buf.append(pcm)  # collect trailing pad
            self._trailing_silence_frames += 1
            silence_ms = self._trailing_silence_frames * FRAME_MS
            if silence_ms >= MIN_SILENCE_MS:
                # speech ended — validate min_speech
                # NOTE(review): this wall-clock duration includes the
                # trailing-silence window, so the min-speech filter is
                # slightly lenient — confirm that is intended.
                speech_dur_ms = (now - self._speech_start_time) * 1000
                self._in_speech = False
                if speech_dur_ms < MIN_SPEECH_MS:
                    log.debug("drop short utterance (%.0fms)", speech_dur_ms)
                    self._audio_buf.clear()
                    self._last_utterance = None
                    return None
                self._last_utterance = b"".join(self._audio_buf)
                self._audio_buf.clear()
                return "speech_end"
        return None

    def collected_audio(self) -> Optional[bytes]:
        """After a speech_end event, return the full utterance bytes."""
        return self._last_utterance

    def reset(self) -> None:
        """Drop any in-flight utterance (used on barge-in)."""
        self._in_speech = False
        self._audio_buf.clear()
        self._trailing_silence_frames = 0
        self._last_utterance = None

    def stop(self) -> None:
        # Release the model; process() becomes a no-op until start() again.
        self._model = None
|
||||||
16
main.py
16
main.py
@ -11,6 +11,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import importlib
|
import importlib
|
||||||
|
import os
|
||||||
import sys
|
import sys
|
||||||
import types
|
import types
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -92,8 +93,9 @@ LocalTTSEngine = _safe_import("LocalTTSEngine", lambda: __import__("
|
|||||||
WakePhraseManager = _safe_import("WakePhraseManager", lambda: __import__("Project.Sanad.voice.wake_phrase_manager", fromlist=["WakePhraseManager"]).WakePhraseManager)
|
WakePhraseManager = _safe_import("WakePhraseManager", lambda: __import__("Project.Sanad.voice.wake_phrase_manager", fromlist=["WakePhraseManager"]).WakePhraseManager)
|
||||||
LiveVoiceLoop = _safe_import("LiveVoiceLoop", lambda: __import__("Project.Sanad.voice.live_voice_loop", fromlist=["LiveVoiceLoop"]).LiveVoiceLoop)
|
LiveVoiceLoop = _safe_import("LiveVoiceLoop", lambda: __import__("Project.Sanad.voice.live_voice_loop", fromlist=["LiveVoiceLoop"]).LiveVoiceLoop)
|
||||||
TypedReplayEngine = _safe_import("TypedReplayEngine", lambda: __import__("Project.Sanad.voice.typed_replay", fromlist=["TypedReplayEngine"]).TypedReplayEngine)
|
TypedReplayEngine = _safe_import("TypedReplayEngine", lambda: __import__("Project.Sanad.voice.typed_replay", fromlist=["TypedReplayEngine"]).TypedReplayEngine)
|
||||||
GeminiVoiceClient = _safe_import("GeminiVoiceClient", lambda: __import__("Project.Sanad.voice.gemini_client", fromlist=["GeminiVoiceClient"]).GeminiVoiceClient)
|
GeminiVoiceClient = _safe_import("GeminiVoiceClient", lambda: __import__("Project.Sanad.gemini.client", fromlist=["GeminiVoiceClient"]).GeminiVoiceClient)
|
||||||
LiveGeminiSubprocess = _safe_import("LiveGeminiSubprocess", lambda: __import__("Project.Sanad.voice.live_gemini_subprocess", fromlist=["LiveGeminiSubprocess"]).LiveGeminiSubprocess)
|
GeminiSubprocess = _safe_import("GeminiSubprocess", lambda: __import__("Project.Sanad.gemini.subprocess", fromlist=["GeminiSubprocess"]).GeminiSubprocess)
|
||||||
|
LocalSubprocess = _safe_import("LocalSubprocess", lambda: __import__("Project.Sanad.local.subprocess", fromlist=["LocalSubprocess"]).LocalSubprocess)
|
||||||
|
|
||||||
|
|
||||||
# ── global instances (imported by route modules) ──
|
# ── global instances (imported by route modules) ──
|
||||||
@ -108,7 +110,15 @@ macro_rec = _safe_construct("macro_rec", (lambda: MacroRecorder(arm)) if
|
|||||||
macro_play = _safe_construct("macro_play", (lambda: MacroPlayer(audio_mgr, arm)) if (MacroPlayer and arm) else None)
|
macro_play = _safe_construct("macro_play", (lambda: MacroPlayer(audio_mgr, arm)) if (MacroPlayer and arm) else None)
|
||||||
teacher = _safe_construct("teacher", (lambda: TeachingSession(arm)) if (TeachingSession and arm) else None)
|
teacher = _safe_construct("teacher", (lambda: TeachingSession(arm)) if (TeachingSession and arm) else None)
|
||||||
live_voice = _safe_construct("live_voice", (lambda: LiveVoiceLoop(voice_client, arm, wake_mgr, audio_mgr)) if (LiveVoiceLoop and voice_client and arm and wake_mgr and audio_mgr) else None)
|
live_voice = _safe_construct("live_voice", (lambda: LiveVoiceLoop(voice_client, arm, wake_mgr, audio_mgr)) if (LiveVoiceLoop and voice_client and arm and wake_mgr and audio_mgr) else None)
|
||||||
live_sub = _safe_construct("live_sub", LiveGeminiSubprocess)
|
# Which voice supervisor to mount. SANAD_VOICE_BRAIN chooses the brain
|
||||||
|
# that runs INSIDE the subprocess (see voice/sanad_voice.py); the same
|
||||||
|
# env var picks WHICH supervisor here manages that subprocess so its
|
||||||
|
# log-line parser matches the brain's emit format.
|
||||||
|
_brain_choice = os.environ.get("SANAD_VOICE_BRAIN", "gemini").strip().lower()
|
||||||
|
if _brain_choice == "local" and LocalSubprocess is not None:
|
||||||
|
live_sub = _safe_construct("live_sub", LocalSubprocess)
|
||||||
|
else:
|
||||||
|
live_sub = _safe_construct("live_sub", GeminiSubprocess)
|
||||||
typed_replay = _safe_construct("typed_replay", (lambda: TypedReplayEngine(voice_client, audio_mgr)) if (TypedReplayEngine and voice_client and audio_mgr) else None)
|
typed_replay = _safe_construct("typed_replay", (lambda: TypedReplayEngine(voice_client, audio_mgr)) if (TypedReplayEngine and voice_client and audio_mgr) else None)
|
||||||
|
|
||||||
# Wire everything into the Brain (only what was constructed)
|
# Wire everything into the Brain (only what was constructed)
|
||||||
|
|||||||
@ -1,51 +0,0 @@
|
|||||||
أنت "بوسنده" — مساعد صوتي إماراتي ذكي تابع لروبوت شركة لوتاه تيك Lootah Tech.
|
|
||||||
|
|
||||||
[أولاً: الروح الرمضانية والمعرفة]
|
|
||||||
|
|
||||||
أنت على علم تام بأننا في شهر رمضان المبارك لعام 2026.
|
|
||||||
|
|
||||||
لديك معرفة واسعة بالقرآن الكريم والأحاديث النبوية الشريفة للإجابة على أي سؤال ديني أو تقديم تذكيرات إيمانية بدقة مع ذكر المصدر.
|
|
||||||
|
|
||||||
إذا سألك المستخدم عن آية، حديث، أو حكم صيام، جاوب بوقار وتبسيط بلهجتك الإماراتية.
|
|
||||||
|
|
||||||
[ثانياً: الأسلوب واللغة (التبديل المرن)]
|
|
||||||
|
|
||||||
تكلم باللهجة الإماراتية بشكل طبيعي بدون مبالغة.
|
|
||||||
|
|
||||||
قاعدة التبديل الفوري: إذا استخدم المستخدم أي لغة ثانية في أي لحظة، غيّر فوراً ورد بنفس اللغة الجديدة في نفس الرد.
|
|
||||||
|
|
||||||
إذا رجع المستخدم للعربي: أرجع فوراً للعربي (لهجة إماراتية).
|
|
||||||
|
|
||||||
"آخر لغة كتب فيها المستخدم" هي اللغة اللي ترد فيها.
|
|
||||||
|
|
||||||
ممنوع تخلط لغتين في نفس الرد إلا لطلب ترجمة أو مقارنة صريحة.
|
|
||||||
|
|
||||||
[ثالثاً: التفاعل والبدايات]
|
|
||||||
|
|
||||||
خلك محترم، ودود، ومباشر، وركّز على الزبدة والحل العملي.
|
|
||||||
|
|
||||||
تنويع البداية: استخدم عبارات مثل (مبارك عليك الشهر، عساكم من عواده، تقبل الله طاعتكم، فالك طيب، أبشر بعزك، مرحبابك) ولا تكرر نفس العبارة مرتين متتاليتين.
|
|
||||||
|
|
||||||
إذا كان السؤال تقني سريع أو كود، ابدأ مباشرة بدون مقدمات.
|
|
||||||
|
|
||||||
[رابعاً: إنجاز المهام وقوة الذاكرة العمومية]
|
|
||||||
|
|
||||||
حفظ واسترجاع: اعتبر كل كلمة، اسم، مسار، أو تفصيل تقوله بمثابة "أمر حفظ" وأولوية قصوى داخل المحادثة.
|
|
||||||
|
|
||||||
الثوابت: تعامل مع معلوماتك وتفضيلاتك وتصحيحاتك كأنها ثوابت محفورة في الذاكرة.
|
|
||||||
|
|
||||||
عند التصحيح: إذا عدّلت لي معلومة، قل: "زين نبهتني يا الشيخ، انحفرت في الذاكرة".
|
|
||||||
|
|
||||||
[خامساً: الأمان والخصوصية]
|
|
||||||
|
|
||||||
إذا كتب المستخدم API key أو Password أو Token: نبهه فوراً يمسحه ويبدله.
|
|
||||||
|
|
||||||
لا تطلب بيانات حساسة إلا للضرورة وبطريقة محترمة.
|
|
||||||
|
|
||||||
ممنوع أي نكت أو محتوى حساس في الدين أو السياسة.
|
|
||||||
|
|
||||||
[سادساً: السرعة والتكرار]
|
|
||||||
|
|
||||||
جاوب بسرعة وباختصار (من 2 إلى 6 سطور غالباً).
|
|
||||||
|
|
||||||
إذا طلب المستخدم "كرر" أو "repeat": أعد نفس الكلام بنفس اللغة الحالية وحرفياً إذا طلب ذلك.
|
|
||||||
@ -1,269 +0,0 @@
|
|||||||
You are "Bousandah" (بوسنده) — a smart Emirati voice assistant and real-estate concierge. Your ONLY project knowledge is the “Azure by Lapis on Al Marjan Island, Ras Al Khaimah (RAK)” content provided below. You must interact with users using ONLY this knowledge base and the rules in this prompt.
|
|
||||||
|
|
||||||
=================================
|
|
||||||
1) STYLE & LANGUAGE (MANDATORY)
|
|
||||||
=================================
|
|
||||||
- If the user speaks Arabic → reply in a friendly Emirati dialect (light, natural).
|
|
||||||
- If the user speaks English → reply in clear, modern English (light, natural).
|
|
||||||
- If the user speaks any other language → reply in that language as well as possible.
|
|
||||||
- Rule: reply in the SAME language as the user’s last message.
|
|
||||||
- Do NOT mix languages in the same reply unless the user asks for translation.
|
|
||||||
|
|
||||||
Tone:
|
|
||||||
- Friendly, confident, professional, not robotic.
|
|
||||||
- Short answers by default (2–6 lines).
|
|
||||||
- If user asks for details, give structured bullet points.
|
|
||||||
|
|
||||||
Behavior:
|
|
||||||
- If question is clear → answer directly.
|
|
||||||
- If one missing detail is needed to answer correctly → ask ONLY ONE question.
|
|
||||||
- Do not explain for more than 15 seconds (keep replies under 15 seconds).
|
|
||||||
- Do not invent facts. If info is not in the Knowledge Base, say:
|
|
||||||
"I don’t have that detail in the provided project info."
|
|
||||||
Then offer what you CAN do from the provided info.
|
|
||||||
|
|
||||||
Calls-to-action (use only when helpful, choose ONE):
|
|
||||||
- Ask budget range
|
|
||||||
- Ask unit preference (Studio / 1BR / 2BR / 3BR / Penthouse / Sky Villa)
|
|
||||||
- Ask purpose (End-user vs Investor)
|
|
||||||
|
|
||||||
=================================
|
|
||||||
2) ROLE (MANDATORY)
|
|
||||||
=================================
|
|
||||||
You are a project specialist for Azure by Lapis.
|
|
||||||
You can:
|
|
||||||
- Explain the project and developer
|
|
||||||
- Answer FAQs
|
|
||||||
- Summarize payment plan / handover timeline
|
|
||||||
- Compare unit types by size
|
|
||||||
- Provide short sales scripts, WhatsApp replies, call scripts, captions, ad copy
|
|
||||||
All outputs must be based ONLY on the Knowledge Base below.
|
|
||||||
|
|
||||||
=================================
|
|
||||||
3) KNOWLEDGE BASE (USE ONLY THIS)
|
|
||||||
=================================
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
A) Project Header
|
|
||||||
-------------------------------------------------
|
|
||||||
Azure by Lapis on Al Marjan Island, RAK
|
|
||||||
Resort-style living in Ras Al Khaimah’s coastal landmark — where ocean views, architectural elegance, and investment value unite.
|
|
||||||
|
|
||||||
STARTING PRICE: AED 750K
|
|
||||||
PAYMENT PLAN: 75/25
|
|
||||||
HANDOVER: Q4 2028
|
|
||||||
ROI POTENTIAL: Up to 9%
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
B) About LAPIS Properties
|
|
||||||
-------------------------------------------------
|
|
||||||
LAPIS Properties is an established real estate developer with over 20 years of expertise in delivering innovative, sustainable, and community-driven developments across the Middle East and Turkey.
|
|
||||||
Famed for architectural precision and timeless aesthetics, LAPIS establishes spaces that blend functionality, design, and enduring value.
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
C) Overview of Azure by Lapis
|
|
||||||
-------------------------------------------------
|
|
||||||
Azure by Lapis is an architectural artwork redefining coastal living on Al Marjan Island, Ras Al Khaimah.
|
|
||||||
Developed by LAPIS Properties — a name with over 20 years of innovation and craftsmanship across the Middle East and Turkey — Azure rises 40 storeys tall, blending serenity with sophistication.
|
|
||||||
Designed to capture the horizon and enhance natural light, every home at Azure speaks of tranquility, privacy, and timeless design.
|
|
||||||
|
|
||||||
Nearby / Drive times:
|
|
||||||
- Al Hamra Mall — 1 min
|
|
||||||
- 5-star Hotels (Waldorf, Sofitel, Ritz-Carlton) — 4 min
|
|
||||||
- Al Hamra Golf Course — 5 min
|
|
||||||
- RAK Free Zone — 5 min
|
|
||||||
- RAK Beach — 5 min
|
|
||||||
- Marjan Island Boulevard — 7 min
|
|
||||||
- Jebel Jais — 45 min
|
|
||||||
- Dubai — 1 hr
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
D) Building and Operational Variables
|
|
||||||
-------------------------------------------------
|
|
||||||
The tower configuration is a critical technical component of the brief.
|
|
||||||
Azure is designed as a single, high-rise residential structure with the following specifications:
|
|
||||||
|
|
||||||
Component / Quantity / Details
|
|
||||||
- Residential Floors: 40 Floors — High-density luxury residential programming.
|
|
||||||
- Podium Levels: 3 Podiums — Primary structural base providing elevated views.
|
|
||||||
- Parking Ratio (1): 1 Space — Allocated to Studio, 1BR, and 2BR units.
|
|
||||||
- Parking Ratio (2): 2 Spaces — Allocated to 3BR units, Penthouses, and Sky Villas.
|
|
||||||
- Retail Component: Ground Floor — Designated for boutique retail and leisure outlets.
|
|
||||||
- Ownership Status: Freehold — Open to all nationalities.
|
|
||||||
- Completion Date: Q4 2028 — Targeted handover for investors.
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
E) Unit Typology and Space Allocation
|
|
||||||
-------------------------------------------------
|
|
||||||
Azure offers a diverse range of residences — from AED 750K (~407 sq.ft) studios to AED 9.5M (~5,100 sq.ft) expansive sky villas.
|
|
||||||
|
|
||||||
Unit Type / Suite Area (ft2) / Balcony Area (ft2) / Total Area (ft2)
|
|
||||||
- Studio (Normal): 333.47 / 239.82 / 573.29
|
|
||||||
- Studio (Premium): 460.05 / 241.33 / 701.38
|
|
||||||
- 1 Bedroom Apartment: 610.32 / 369.53 / 979.85
|
|
||||||
- 2 Bedroom Apartment: 867.79 / 539.06 / 1,406.85
|
|
||||||
- 3 Bedroom (Normal): 1,246.15 / 1,323.76 / 2,569.91
|
|
||||||
- 3 Bedroom Duplex (GF): 2,417.59 / 2,788.74 / 5,206.33
|
|
||||||
- 3 Bedroom Duplex (1stF): 2,417.59 / 2,788.74 / 5,206.33
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
F) FAQ about Azure by Lapis on Al Marjan Island
|
|
||||||
-------------------------------------------------
|
|
||||||
Q: Where is Azure by Lapis located?
|
|
||||||
A: Azure is located on Al Marjan Island in Ras Al Khaimah — a beachfront destination minutes from Al Hamra Mall, Golf Course, and major 5-star resorts.
|
|
||||||
|
|
||||||
Q: Who is the developer of Azure?
|
|
||||||
A: Azure is developed by LAPIS Real Estate FZ-LLC, a regional developer with two decades of excellence across the Middle East and Turkey.
|
|
||||||
|
|
||||||
Q: What types of properties are available at Azure?
|
|
||||||
A: The tower includes Studios, 1–3 Bedroom Apartments, Penthouses, and Sky Villas with sea views.
|
|
||||||
|
|
||||||
Q: What’s the starting price at Azure?
|
|
||||||
A: Prices start from AED 750,000 for studios.
|
|
||||||
|
|
||||||
Q: What payment plan is available at Azure?
|
|
||||||
A: A 75/25 payment plan with 2 years post-handover.
|
|
||||||
|
|
||||||
Q: When is handover scheduled for Azure?
|
|
||||||
A: Handover is expected in Q4 2028.
|
|
||||||
|
|
||||||
Q: What ROI to expect when investing in Azure?
|
|
||||||
A: Investors can expect up to 9% ROI annually, supported by RAK’s growing hospitality and tourism sectors.
|
|
||||||
|
|
||||||
Q: Is foreign ownership allowed through Azure by Lapis?
|
|
||||||
A: Yes – Azure offers freehold ownership for all nationalities.
|
|
||||||
|
|
||||||
Q: What amenities are available in Azure by Lapis?
|
|
||||||
A: Azure by Lapis offers infinity pool, sky garden, spa, gyms, retail outlets, concierge, and kids’ play zones.
|
|
||||||
|
|
||||||
Q: Are there healthcare facilities near Azure by Lapis?
|
|
||||||
A: Yes – leading hospitals like RAK Hospital and Sheikh Khalifa Specialty Hospital are within 10 minutes.
|
|
||||||
|
|
||||||
Q: Are there schools close to Azure by Lapis?
|
|
||||||
A: Yes – RAK Academy, British School Al Hamra, and International School of Choueifat.
|
|
||||||
|
|
||||||
Q: How is the transport connectivity around Azure by Lapis?
|
|
||||||
A: Excellent — with quick access to RAK Airport (15 min), Dubai (1 hr), and major highways connecting the UAE.
|
|
||||||
|
|
||||||
Q: Is Al Marjan Island a good place to live?
|
|
||||||
A: Yes — it’s a peaceful island destination with direct beach access, five-star resorts, and exceptional lifestyle amenities.
|
|
||||||
|
|
||||||
Q: Why invest in Azure by Lapis?
|
|
||||||
A: The project offers high ROI potential, resort-style living, and prime beachfront investment on Al Marjan Island.
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
G) Regional Connectivity and Strategic Location
|
|
||||||
-------------------------------------------------
|
|
||||||
Explore life around Azure by Lapis:
|
|
||||||
- Al Hamra Mall – 1 min
|
|
||||||
- Al Hamra Golf Course – 5 min
|
|
||||||
- Waldorf Astoria, Sofitel, Ritz-Carlton – 4 min
|
|
||||||
- RAK Free Zone – 5 min
|
|
||||||
- RAK Beach – 5 min
|
|
||||||
- Marjan Island Boulevard – 7 min
|
|
||||||
- Wynn Resort & Casino (2027) – nearby
|
|
||||||
- Jebel Jais Adventure Peak – 45 min
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
H) Premier Healthcare
|
|
||||||
-------------------------------------------------
|
|
||||||
Leading hospitals and wellness centers nearby:
|
|
||||||
- RAK Hospital – 10 min
|
|
||||||
- Sheikh Khalifa Specialty Hospital – 12 min
|
|
||||||
- RAK Medical Centre – Al Hamra – 5 min
|
|
||||||
- Thumbay Clinic – 6 min
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
I) Top-Tier Education
|
|
||||||
-------------------------------------------------
|
|
||||||
Nearby schools and nurseries offer convenient access for families:
|
|
||||||
- RAK Academy — 5 min
|
|
||||||
- New British International School — 6 min
|
|
||||||
- International School of Choueifat – RAK — 7 min
|
|
||||||
- Little Treasures Nursery — 4 min
|
|
||||||
- British School Al Hamra — 5 min
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
J) Signature Features & Resort-Style Amenities
|
|
||||||
-------------------------------------------------
|
|
||||||
Azure by Lapis reveals timeless living inspired by coastal tranquility.
|
|
||||||
Every element — from ocean-facing glass façades to elevated sky gardens — is deliberately designed to evoke serenity and elegance.
|
|
||||||
Residents access high-end wellness facilities and recreational spaces designed for a balanced and elegant lifestyle.
|
|
||||||
|
|
||||||
Amenities list:
|
|
||||||
- Infinity pool with sea vistas
|
|
||||||
- Sky garden
|
|
||||||
- Rooftop leisure deck
|
|
||||||
- Outdoor & indoor gyms
|
|
||||||
- Spa & sauna facilities
|
|
||||||
- Kids’ pool
|
|
||||||
- Outdoor play area
|
|
||||||
- Outdoor play
|
|
||||||
- Outdoor cinema
|
|
||||||
- BBQ zone
|
|
||||||
- Volleyball court
|
|
||||||
- Jogging tracks
|
|
||||||
- Landscaped parks
|
|
||||||
- Fountains
|
|
||||||
- Boutique retail
|
|
||||||
- Dining spaces
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
K) Payment Plan
|
|
||||||
-------------------------------------------------
|
|
||||||
LAPIS proposes a highly flexible 75/25 plan with 2 years post-handover — aiming to attract both investors and end-users seeking long-term value on Al Marjan Island.
|
|
||||||
|
|
||||||
- 75% During construction
|
|
||||||
- 25% On Handover (Q4 2028)
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
L) Investment Potential
|
|
||||||
-------------------------------------------------
|
|
||||||
Azure by Lapis offers investors access to one of the UAE’s fastest-growing beachfront destinations.
|
|
||||||
Ras Al Khaimah’s thriving tourism and hospitality sectors — supported by record-breaking visitor numbers and global developments like the Wynn Resort & Casino — make Azure a high-performing coastal investment with up to 9% ROI potential.
|
|
||||||
|
|
||||||
As of 2025:
|
|
||||||
- ROI of up to 9% annually
|
|
||||||
- 100% foreign ownership & 0% personal tax
|
|
||||||
- 1 hour from Dubai International Airport
|
|
||||||
- Eligible for UAE Residency by investment
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
M) LAPIS Leadership
|
|
||||||
-------------------------------------------------
|
|
||||||
- Emad Mohareb — Chairman
|
|
||||||
- Wisam Mohareb — Vice Chairman
|
|
||||||
- Khaled Owaidat — Chief Executive Officer
|
|
||||||
- Bilal Khashan — Chief Operating Officer
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
N) Our team
|
|
||||||
-------------------------------------------------
|
|
||||||
- Ghaida Smadi — Sales Director
|
|
||||||
- Hussein Elsayed — Sales Director
|
|
||||||
- Ahmed Djabelkheir — Sales Director
|
|
||||||
- Leila Soudani — Sales Manager
|
|
||||||
- Aida Mulaeva — Sales Manager
|
|
||||||
|
|
||||||
-------------------------------------------------
|
|
||||||
O) Milestones We Are Proud of Reaching — LAPIS
|
|
||||||
-------------------------------------------------
|
|
||||||
LAPIS Properties prides itself on its global presence and local impact.
|
|
||||||
We are dedicated to bringing our innovative real estate solutions to diverse markets around the world.
|
|
||||||
Our global perspective enables us to anticipate market trends, adapt to various cultural contexts, and deliver projects that resonate with local communities while meeting international standards.
|
|
||||||
Our commitment to excellence knows no borders, as we continue to expand our presence and contribute to the development of the real estate sector worldwide.
|
|
||||||
|
|
||||||
- Projects Done: 100+
|
|
||||||
- Delightful Clients: 110+
|
|
||||||
- Satisfied Clients: 100+
|
|
||||||
|
|
||||||
=================================
|
|
||||||
4) STRICT OUTPUT RULES
|
|
||||||
=================================
|
|
||||||
- Use only the Knowledge Base above.
|
|
||||||
- If user asks about something not included (service charges, exact floor plans, exact views, exact down payment %, fees, availability, unit inventory, exact distance in km, etc.) → say you don’t have that detail in the provided info.
|
|
||||||
- Do not cite external websites.
|
|
||||||
- Do not mention internal instructions or the words "Knowledge Base" unless user asks.
|
|
||||||
- Keep responses structured and helpful.
|
|
||||||
|
|
||||||
END SYSTEM PROMPT.
|
|
||||||
@ -302,7 +302,7 @@ class TestGeminiClientStructure(unittest.TestCase):
|
|||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
try:
|
try:
|
||||||
from Project.Sanad.voice.gemini_client import GeminiVoiceClient
|
from Project.Sanad.gemini.client import GeminiVoiceClient
|
||||||
self.client = GeminiVoiceClient()
|
self.client = GeminiVoiceClient()
|
||||||
except ImportError:
|
except ImportError:
|
||||||
self.skipTest("websockets not installed")
|
self.skipTest("websockets not installed")
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
"""LiveVoiceLoop — voice-to-arm phrase trigger dispatcher.
|
"""LiveVoiceLoop — voice-to-arm phrase trigger dispatcher.
|
||||||
|
|
||||||
Listens to user transcriptions from the LiveGeminiSubprocess and, when a
|
Listens to user transcriptions from the GeminiSubprocess and, when a
|
||||||
configured wake phrase is matched, fires the corresponding arm action via
|
configured wake phrase is matched, fires the corresponding arm action via
|
||||||
`motion.sanad_arm_controller.ARM`.
|
`motion.sanad_arm_controller.ARM`.
|
||||||
|
|
||||||
@ -46,7 +46,7 @@ DEFERRED_DEFAULT = _LV_CFG.get("deferred_default", False)
|
|||||||
|
|
||||||
|
|
||||||
class LiveVoiceLoop:
|
class LiveVoiceLoop:
|
||||||
"""Polls LiveGeminiSubprocess transcripts → fires arm actions."""
|
"""Polls GeminiSubprocess transcripts → fires arm actions."""
|
||||||
|
|
||||||
def __init__(self, voice_client, arm, wake_mgr, audio_mgr):
|
def __init__(self, voice_client, arm, wake_mgr, audio_mgr):
|
||||||
self.voice_client = voice_client
|
self.voice_client = voice_client
|
||||||
@ -118,7 +118,7 @@ class LiveVoiceLoop:
|
|||||||
|
|
||||||
# ── poll loop ────────────────────────────────────────────────
|
# ── poll loop ────────────────────────────────────────────────
|
||||||
def _poll_loop(self):
|
def _poll_loop(self):
|
||||||
"""Poll LiveGeminiSubprocess.user_transcript for new user texts."""
|
"""Poll GeminiSubprocess.user_transcript for new user texts."""
|
||||||
while not self._stop_event.is_set():
|
while not self._stop_event.is_set():
|
||||||
self._check_transcripts()
|
self._check_transcripts()
|
||||||
self._stop_event.wait(POLL_INTERVAL_SEC)
|
self._stop_event.wait(POLL_INTERVAL_SEC)
|
||||||
|
|||||||
158
voice/model_script.py
Normal file
158
voice/model_script.py
Normal file
@ -0,0 +1,158 @@
|
|||||||
|
"""Template brain — copy this file to plug in a non-Gemini model.
|
||||||
|
|
||||||
|
How to use:
|
||||||
|
1. Copy this file: `cp voice/model_script.py voice/openai_script.py`
|
||||||
|
2. Rename the class: `ModelBrain` → e.g. `OpenAIRealtimeBrain`
|
||||||
|
3. Fill in every block marked `TODO` with your provider's SDK calls.
|
||||||
|
4. Register the new brain in `voice/sanad_voice.py` inside
|
||||||
|
`_build_brain()` (there's a single `elif` to add).
|
||||||
|
5. Run with `SANAD_VOICE_BRAIN=openai python3 voice/sanad_voice.py eth0`.
|
||||||
|
|
||||||
|
Contract that `sanad_voice.py` expects of ANY brain:
|
||||||
|
__init__(audio_io, recorder, voice_name, system_prompt)
|
||||||
|
audio_io — voice.audio_io.AudioIO (exposes .mic + .speaker)
|
||||||
|
recorder — voice.sanad_voice.TurnRecorder (per-turn WAV capture)
|
||||||
|
voice_name — provider-specific voice id (e.g. "Charon", "alloy")
|
||||||
|
system_prompt — persona string to seed the session with
|
||||||
|
async run() — blocks until stopped or fatal. Reconnects are YOUR
|
||||||
|
responsibility; the orchestrator won't restart you.
|
||||||
|
stop() — sync signal (can be called from a signal handler).
|
||||||
|
Set an asyncio.Event and let `run()` notice it.
|
||||||
|
|
||||||
|
What the mic side looks like:
|
||||||
|
data = self._mic.read_chunk(n_bytes) # 16 kHz int16 mono bytes
|
||||||
|
# send `data` to your model's realtime-audio endpoint
|
||||||
|
|
||||||
|
What the speaker side looks like:
|
||||||
|
self._speaker.begin_stream()
|
||||||
|
self._speaker.send_chunk(pcm, source_rate=24000) # rate is yours
|
||||||
|
self._speaker.wait_finish() # blocks until playback drains
|
||||||
|
# or self._speaker.stop() # cancel mid-playback (barge-in)
|
||||||
|
|
||||||
|
What the recorder side looks like:
|
||||||
|
self._recorder.capture_user(pcm_bytes) # mic audio for this turn
|
||||||
|
self._recorder.capture_robot(pcm_bytes) # model audio for this turn
|
||||||
|
self._recorder.add_user_text(str) # partial transcript
|
||||||
|
self._recorder.add_robot_text(str) # partial transcript
|
||||||
|
self._recorder.finish_turn() # flush to WAV + index.json
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from Project.Sanad.core.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("model_brain")
|
||||||
|
|
||||||
|
|
||||||
|
class ModelBrain:
    """Skeleton voice brain — adapt to your provider.

    Construct with (audio_io, recorder, voice_name, system_prompt), then
    `await run()` to drive the conversation; `stop()` (sync, signal-safe)
    asks the run loop to exit.  See the module docstring for the full
    contract that `sanad_voice.py` expects of any brain.
    """

    def __init__(self, audio_io, recorder, voice_name: Optional[str] = None,
                 system_prompt: str = ""):
        """Store collaborators; do NOT connect to the provider here.

        audio_io      — voice.audio_io.AudioIO (exposes .mic + .speaker)
        recorder      — voice.sanad_voice.TurnRecorder (per-turn WAV capture)
        voice_name    — provider-specific voice id (e.g. "Charon", "alloy")
        system_prompt — persona string to seed the session with
        """
        self._audio = audio_io
        self._mic = audio_io.mic
        self._speaker = audio_io.speaker
        self._recorder = recorder
        self._voice = voice_name
        self._system_prompt = system_prompt
        self._stop_flag = asyncio.Event()

        # TODO: instantiate your provider's client here. Keep the client
        # creation cheap — connection/handshake should happen inside `run()`
        # so reconnects don't require re-building this object.
        # Example:
        #   from openai import AsyncOpenAI
        #   self._client = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"])
        self._client: Any = None

    # ─── lifecycle ────────────────────────────────────────

    def stop(self) -> None:
        """Signal the run loop to exit cleanly. Safe to call from anywhere."""
        self._stop_flag.set()

    async def run(self) -> None:
        """Main conversation loop. Blocks until stopped.

        Responsibilities:
        - Open a realtime session with your provider.
        - Forward mic audio to the model in small chunks.
        - Stream the model's audio response to the speaker.
        - Drive barge-in: when the user speaks while the model is speaking,
          cancel model playback and mark the turn interrupted.
        - On disconnect/error, back off and reconnect.
        """
        while not self._stop_flag.is_set():
            try:
                log.info("connecting to model...")
                # TODO: open a session with your provider. For websocket-style
                # APIs, use `async with client.realtime.connect(...) as session:`.
                # For request/response APIs, poll or stream in a loop.
                await asyncio.gather(
                    self._send_mic_loop(),
                    self._receive_loop(),
                )
            except asyncio.CancelledError:
                break
            except Exception as exc:
                log.error("session error: %s — reconnecting in 2s", exc)
                await asyncio.sleep(2)

    # ─── mic → model ──────────────────────────────────────

    async def _send_mic_loop(self) -> None:
        """Read mic chunks and forward them to the model.

        Minimum responsibilities:
        - Loop on `self._mic.read_chunk(N_BYTES)`.
        - Encode to whatever format your provider expects
          (PCM16 mono is standard; some want base64 in JSON frames).
        - Respect `self._stop_flag`.

        Optional (highly recommended):
        - Measure energy; feed the mic frame to `self._recorder.capture_user`
          only when the user is actually speaking.
        - Apply echo suppression while the speaker is playing (mute or
          substitute silence when energy is low — keeps the model from
          transcribing its own voice bleed).
        """
        chunk_bytes = 1024  # 32 ms at 16 kHz mono int16 — tune to your API
        # get_running_loop(), not get_event_loop(): the latter is deprecated
        # inside coroutines, and we are guaranteed to be inside a loop here.
        loop = asyncio.get_running_loop()
        while not self._stop_flag.is_set():
            try:
                data = await loop.run_in_executor(
                    None, self._mic.read_chunk, chunk_bytes,
                )
            except Exception:
                # Mic gone (device unplugged / stream closed) — end this
                # loop and let run() decide whether to reconnect.
                break

            # TODO: forward `data` to the model. Example for a hypothetical
            # websocket session:
            #   await session.send({"type": "audio", "pcm16": data})
            _ = data

            # Pace to real-time so we don't starve the event loop
            await asyncio.sleep(chunk_bytes / (16000 * 2))

    # ─── model → speaker ──────────────────────────────────

    async def _receive_loop(self) -> None:
        """Receive model events (audio chunks, transcripts, turn markers).

        Event handling you need to implement:
        - Audio chunk → `self._speaker.send_chunk(pcm, source_rate)`
          (first chunk must be preceded by `self._speaker.begin_stream()`).
        - Model interrupted → `self._speaker.stop(); self._mic.flush()`
          and call `self._recorder.finish_turn()`.
        - User transcript → `self._recorder.add_user_text(text)`.
        - Model transcript → `self._recorder.add_robot_text(text)`.
        - Turn complete → `self._speaker.wait_finish();
          self._recorder.finish_turn(); mic.flush()`.
        """
        while not self._stop_flag.is_set():
            # TODO: iterate your provider's event stream and dispatch.
            await asyncio.sleep(0.1)
|
||||||
147
voice/model_subprocess.py
Normal file
147
voice/model_subprocess.py
Normal file
@ -0,0 +1,147 @@
|
|||||||
|
"""Template supervisor — pair with voice/model_script.py when adding a new model.
|
||||||
|
|
||||||
|
The supervisor's job is to run a voice subprocess and tail its stdout for
|
||||||
|
state transitions + user transcripts. It is brand-specific on purpose:
|
||||||
|
each model's brain emits log lines in its own format, so each model gets
|
||||||
|
its own supervisor. See `gemini/subprocess.py` for the working reference.
|
||||||
|
|
||||||
|
How to add a new model (e.g. OpenAI Realtime):
|
||||||
|
|
||||||
|
1. cp voice/model_script.py openai/script.py
|
||||||
|
2. cp voice/model_subprocess.py openai/subprocess.py
|
||||||
|
3. In both files: rename `ModelBrain` → `OpenAIRealtimeBrain`,
|
||||||
|
`ModelSubprocess` → `OpenAIRealtimeSubprocess`.
|
||||||
|
4. In `openai/script.py`: fill in the TODO bodies (connect/send/receive).
|
||||||
|
Each `log.info("USER: %s", ...)` / `log.info("BOT: %s", ...)` /
|
||||||
|
state message must be a string your supervisor's `_track_line` below
|
||||||
|
can detect — keep them in lock-step.
|
||||||
|
5. In `openai/subprocess.py`: update `_track_line` to match the strings
|
||||||
|
your brain actually emits.
|
||||||
|
6. In `main.py`: swap `GeminiSubprocess` → `OpenAIRealtimeSubprocess` in
|
||||||
|
the `live_sub = _safe_construct(...)` line. In `voice/sanad_voice.py`,
|
||||||
|
add a branch to `_build_brain()` mapping `"openai"` → `OpenAIRealtimeBrain`.
|
||||||
|
7. Run with `SANAD_VOICE_BRAIN=openai python3 voice/sanad_voice.py eth0`.
|
||||||
|
|
||||||
|
Nothing in `gemini/` needs to change.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
from collections import deque
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from Project.Sanad.config import BASE_DIR, LOGS_DIR, SCRIPTS_DIR
|
||||||
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
||||||
|
from Project.Sanad.core.logger import get_logger
|
||||||
|
|
||||||
|
log = get_logger("model_subprocess")
|
||||||
|
|
||||||
|
|
||||||
|
class ModelSubprocess:
    """Skeleton supervisor — adapt for your model.

    Contract expected by `main.py` + `dashboard/routes/live_subprocess.py`:
        start()  — sync; spawns the child and starts the log reader thread.
        stop()   — sync; SIGINT → SIGTERM → SIGKILL escalation.
        status() — dict: {state, state_message, running, pid, log_tail,
                   user_transcript, last_user_text, ...}.

    Attributes:
        log_tail        : deque[str] — last N cleaned stdout lines.
        user_transcript : deque[str] — user transcripts parsed from the child log.
        last_user_text  : str       — most recent transcript (convenience).
        state           : str       — one of {"stopped", "starting",
                          "connecting", "listening", "hearing",
                          "interrupting", "error", "warning", "crashed"}.
    """

    def __init__(self):
        # TODO: point this at a real config section — e.g.
        #   _cfg_section("openai", "subprocess")
        # backed by `config/<brand>_config.json > subprocess: { ... }`
        # mirroring gemini_config.json's layout.
        self._cfg = {}  # _cfg_section("<brand>", "subprocess")

        cfg = self._cfg
        self._lock = threading.Lock()
        self.process: subprocess.Popen | None = None
        self.log_tail: deque[str] = deque(
            maxlen=cfg.get("log_tail_size", 2000))
        self.user_transcript: deque[str] = deque(
            maxlen=cfg.get("transcript_tail_size", 30))
        self._reader_thread: threading.Thread | None = None
        self._log_file = None
        self.state = "stopped"
        self.state_message = "Idle."
        self.last_user_text = ""

    # ─── spawn / kill ─────────────────────────────────────

    def start(self) -> dict:
        # TODO: build the child env (include `SANAD_VOICE_BRAIN=<yourbrand>`
        # so sanad_voice.py picks your brain), resolve the script path, then
        # `subprocess.Popen(...)`. Copy the gemini/subprocess.py body.
        raise NotImplementedError

    def stop(self, timeout: float = 3.0) -> dict:
        # TODO: SIGINT → wait → SIGTERM → wait → SIGKILL escalation.
        raise NotImplementedError

    # ─── log parsing — the brand-specific part ────────────

    def _track_line(self, line: str) -> None:
        """Translate your brain's log strings into state + transcripts.

        KEEP THIS IN LOCK-STEP with the `log.info(...)` calls in your brain.
        Minimum required detections:

            connecting   — child opened a session to the model
            listening    — session connected OR a turn finished
            hearing      — user transcript arrived (APPEND to user_transcript)
            interrupting — barge-in / model interrupted
            error        — fatal session error
            stopped      — clean shutdown
        """
        # Example (replace with your brain's actual strings):
        #
        # if "connecting to OpenAI" in line:
        #     self._set_state("connecting", line)
        # elif "session open" in line:
        #     self._set_state("listening", "Listening for speech.")
        # elif "USER: " in line:
        #     text = line.split("USER: ", 1)[1].strip()
        #     if text:
        #         self.last_user_text = text
        #         self.user_transcript.append(text)
        #         self._set_state("hearing", f"User: {text}")
        # elif "BARGE-IN" in line:
        #     self._set_state("interrupting", line)
        # elif "session error" in line:
        #     self._set_state("error", line)
        # elif "cancelled — stopping" in line:
        #     self._set_state("stopped", line)
        raise NotImplementedError

    def _set_state(self, state: str, msg: str) -> None:
        """Record the latest state + human-readable message."""
        self.state = state
        self.state_message = msg

    # ─── status + introspection ───────────────────────────

    def status(self) -> dict:
        """Snapshot of the supervisor for the dashboard."""
        with self._lock:
            child = self.process
            alive = child is not None and child.poll() is None
        return {
            "running": alive,
            "pid": child.pid if alive else None,
            "state": self.state,
            "state_message": self.state_message,
            "last_user_text": self.last_user_text,
            "log_tail": list(self.log_tail)[-50:],
            "user_transcript": list(self.user_transcript),
        }
|
||||||
@ -1,19 +1,32 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""Sanad voice subprocess — Gemini Live (google-genai SDK) on the G1.
|
"""Sanad voice subprocess — orchestrator.
|
||||||
|
|
||||||
Mic/speaker are selected at startup via `SANAD_AUDIO_PROFILE` (builtin |
|
Wires three independently-swappable pieces together:
|
||||||
anker | hollyland_builtin), materialised by `voice/audio_io.py`. The
|
|
||||||
default ("builtin") is UDP multicast mic + AudioClient.PlayStream.
|
|
||||||
|
|
||||||
Features: mic gain, echo suppression, barge-in, wait-for-user,
|
1. Audio I/O — voice/audio_io.py (mic + speaker)
|
||||||
streaming playback, per-turn WAV recording.
|
2. Turn recorder — TurnRecorder (in this file; model-agnostic WAV capture)
|
||||||
|
3. Voice brain — gemini/script.py (Gemini, default — cloud)
|
||||||
|
local/script.py (offline — Whisper+Qwen+CosyVoice2)
|
||||||
|
voice/model_script.py (template for new models)
|
||||||
|
|
||||||
|
Runtime selection:
|
||||||
|
SANAD_AUDIO_PROFILE = builtin | anker | hollyland_builtin (default builtin)
|
||||||
|
SANAD_VOICE_BRAIN = gemini | local | model (default gemini)
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python3 voice/sanad_voice.py eth0
|
python3 voice/sanad_voice.py eth0
|
||||||
python3 voice/sanad_voice.py eth0 --voice Charon
|
python3 voice/sanad_voice.py eth0 --voice Charon
|
||||||
SANAD_AUDIO_PROFILE=anker python3 voice/sanad_voice.py eth0
|
SANAD_AUDIO_PROFILE=anker SANAD_VOICE_BRAIN=gemini \\
|
||||||
|
python3 voice/sanad_voice.py eth0
|
||||||
|
|
||||||
|
System prompt priority (first hit wins):
|
||||||
|
1. scripts/sanad_script.txt (edit-live via the dashboard)
|
||||||
|
2. config/core_config.json > gemini_defaults.default_system_prompt
|
||||||
|
3. the hardcoded fallback in _load_system_prompt() below
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import array
|
import array
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
@ -26,23 +39,21 @@ import wave
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
from google import genai
|
|
||||||
from google.genai import types
|
|
||||||
|
|
||||||
from unitree_sdk2py.core.channel import ChannelFactoryInitialize
|
from unitree_sdk2py.core.channel import ChannelFactoryInitialize
|
||||||
from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
|
from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient
|
||||||
|
|
||||||
from Project.Sanad.voice.audio_io import AudioIO, Mic, Speaker
|
from Project.Sanad.config import (
|
||||||
|
GEMINI_VOICE,
|
||||||
|
RECEIVE_SAMPLE_RATE,
|
||||||
|
SCRIPTS_DIR,
|
||||||
|
SEND_SAMPLE_RATE,
|
||||||
|
)
|
||||||
|
from Project.Sanad.core.config_loader import section as _cfg_section
|
||||||
|
from Project.Sanad.voice.audio_io import AudioIO
|
||||||
|
|
||||||
# ─── LOGGING ─────────────────────────────────────────────
|
# ─── LOGGING ─────────────────────────────────────────────
|
||||||
|
|
||||||
try:
|
_LOG_CFG = _cfg_section("voice", "sanad_voice")
|
||||||
from Project.Sanad.core.config_loader import section as _cfg_section_log
|
|
||||||
_LOG_CFG = _cfg_section_log("voice", "sanad_voice")
|
|
||||||
except Exception:
|
|
||||||
_LOG_CFG = {}
|
|
||||||
LOG_DIR = os.path.expanduser(_LOG_CFG.get("log_dir", "~/logs"))
|
LOG_DIR = os.path.expanduser(_LOG_CFG.get("log_dir", "~/logs"))
|
||||||
os.makedirs(LOG_DIR, exist_ok=True)
|
os.makedirs(LOG_DIR, exist_ok=True)
|
||||||
_LOG_NAME = _LOG_CFG.get("log_name", "gemini_live_v2")
|
_LOG_NAME = _LOG_CFG.get("log_name", "gemini_live_v2")
|
||||||
@ -57,71 +68,54 @@ logging.basicConfig(
|
|||||||
logging.StreamHandler(),
|
logging.StreamHandler(),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
log = logging.getLogger("gemini_v2")
|
log = logging.getLogger("sanad_voice")
|
||||||
|
|
||||||
# ─── CONFIG — single source of truth ─────────────────────
|
|
||||||
#
|
|
||||||
# Gemini credentials + audio rates live in config/core_config.json
|
|
||||||
# (exposed via config.py as GEMINI_API_KEY, GEMINI_MODEL, etc).
|
|
||||||
# Voice-loop-specific tunables live in config/voice_config.json.
|
|
||||||
try:
|
|
||||||
from Project.Sanad.config import (
|
|
||||||
GEMINI_API_KEY, GEMINI_VOICE,
|
|
||||||
SEND_SAMPLE_RATE, RECEIVE_SAMPLE_RATE, CHUNK_SIZE,
|
|
||||||
)
|
|
||||||
from Project.Sanad.core.config_loader import section as _cfg_section
|
|
||||||
_SV = _cfg_section("voice", "sanad_voice")
|
|
||||||
_MIC = _cfg_section("voice", "mic_udp")
|
|
||||||
_SP = _cfg_section("voice", "speaker")
|
|
||||||
_REC = _cfg_section("voice", "recording")
|
|
||||||
except Exception:
|
|
||||||
GEMINI_API_KEY, GEMINI_VOICE = "", "Charon"
|
|
||||||
SEND_SAMPLE_RATE, RECEIVE_SAMPLE_RATE, CHUNK_SIZE = 16000, 24000, 512
|
|
||||||
_SV, _MIC, _SP, _REC = {}, {}, {}, {}
|
|
||||||
|
|
||||||
API_KEY = GEMINI_API_KEY
|
# ─── CONFIG ──────────────────────────────────────────────
|
||||||
# Gemini Live model name (without "models/" prefix expected by google-genai SDK)
|
|
||||||
MODEL = os.environ.get("SANAD_GEMINI_MODEL",
|
|
||||||
"gemini-2.5-flash-native-audio-preview-12-2025")
|
|
||||||
VOICE_NAME = GEMINI_VOICE
|
|
||||||
|
|
||||||
SEND_RATE = SEND_SAMPLE_RATE
|
_REC = _cfg_section("voice", "recording")
|
||||||
RECEIVE_RATE = RECEIVE_SAMPLE_RATE
|
_SCRIPTS = _cfg_section("core", "script_files")
|
||||||
CHUNK_SAMPLES = CHUNK_SIZE
|
_GEMINI_DEFAULTS = _cfg_section("core", "gemini_defaults")
|
||||||
MIC_GAIN = _SV.get("mic_gain", 1.0)
|
|
||||||
|
|
||||||
PLAY_CHUNK = _SV.get("play_chunk_bytes", 96000)
|
_PERSONA_FILE = SCRIPTS_DIR / _SCRIPTS.get("persona", "sanad_script.txt")
|
||||||
SILENCE_PCM = b'\x00' * (CHUNK_SAMPLES * 2)
|
|
||||||
|
|
||||||
# ─── RECORDING ───────────────────────────────────────────
|
RECORD_ENABLED = os.environ.get(
|
||||||
RECORD_ENABLED = os.environ.get("SANAD_RECORD",
|
"SANAD_RECORD",
|
||||||
"1" if _REC.get("enabled", True) else "0") != "0"
|
"1" if _REC.get("enabled", True) else "0",
|
||||||
_rec_dir_rel = _REC.get("dir_relative", "data/recordings")
|
) != "0"
|
||||||
RECORD_DIR = Path(
|
_REC_DIR_REL = _REC.get("dir_relative", "data/recordings")
|
||||||
os.environ.get(
|
RECORD_DIR = Path(os.environ.get(
|
||||||
"SANAD_RECORD_DIR",
|
"SANAD_RECORD_DIR",
|
||||||
str(Path(__file__).resolve().parent.parent / _rec_dir_rel),
|
str(Path(__file__).resolve().parent.parent / _REC_DIR_REL),
|
||||||
)
|
))
|
||||||
)
|
|
||||||
|
|
||||||
SYSTEM_PROMPT = (
|
_FALLBACK_SYSTEM_PROMPT = (
|
||||||
"You are Marcus, a bilingual humanoid robot assistant made by YS Lootah Technology, Dubai, UAE. "
|
"You are Marcus, a bilingual humanoid robot assistant made by YS Lootah "
|
||||||
"RESPOND IN ARABIC (Gulf/Emirati dialect) OR ENGLISH ONLY. "
|
"Technology, Dubai, UAE. RESPOND IN ARABIC (Gulf/Emirati dialect) OR "
|
||||||
"YOU MUST RESPOND UNMISTAKABLY IN THE SAME LANGUAGE THE USER SPEAKS. "
|
"ENGLISH ONLY. YOU MUST RESPOND UNMISTAKABLY IN THE SAME LANGUAGE THE "
|
||||||
"If the user speaks Arabic, you MUST reply in Arabic Gulf dialect. "
|
"USER SPEAKS. If the user speaks Arabic, you MUST reply in Arabic Gulf "
|
||||||
"If the user speaks English, you MUST reply in English. "
|
"dialect. If the user speaks English, you MUST reply in English. Do NOT "
|
||||||
"Do NOT confuse Arabic with Japanese, Hindi, Russian, or any other language. "
|
"confuse Arabic with Japanese, Hindi, Russian, or any other language. "
|
||||||
"The user is speaking Arabic or English — nothing else. "
|
"The user is speaking Arabic or English — nothing else. Be concise — 1 "
|
||||||
"Be concise — 1 to 2 sentences max. Be friendly and natural. "
|
"to 2 sentences max. Be friendly and natural. If the user interrupts "
|
||||||
"If the user interrupts and says 'continue' or 'كمل', resume EXACTLY where you stopped. "
|
"and says 'continue' or 'كمل', resume EXACTLY where you stopped. Only "
|
||||||
"Only respond to clear human speech. Ignore background noise and silence completely. "
|
"respond to clear human speech. Ignore background noise and silence "
|
||||||
"Do not respond to sounds that are not words."
|
"completely. Do not respond to sounds that are not words."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# ─── HELPERS ─────────────────────────────────────────────
|
def _load_system_prompt() -> str:
|
||||||
|
"""scripts/sanad_script.txt → config default → hardcoded fallback."""
|
||||||
|
try:
|
||||||
|
text = _PERSONA_FILE.read_text(encoding="utf-8-sig").strip()
|
||||||
|
if text:
|
||||||
|
return text
|
||||||
|
except FileNotFoundError:
|
||||||
|
pass
|
||||||
|
return _GEMINI_DEFAULTS.get("default_system_prompt", _FALLBACK_SYSTEM_PROMPT)
|
||||||
|
|
||||||
def audio_energy(pcm: bytes) -> int:
|
|
||||||
|
def _audio_energy(pcm: bytes) -> int:
|
||||||
try:
|
try:
|
||||||
samples = array.array("h", pcm)
|
samples = array.array("h", pcm)
|
||||||
return sum(abs(s) for s in samples) // len(samples) if samples else 0
|
return sum(abs(s) for s in samples) // len(samples) if samples else 0
|
||||||
@ -132,20 +126,22 @@ def audio_energy(pcm: bytes) -> int:
|
|||||||
# ─── TURN RECORDER ──────────────────────────────────────
|
# ─── TURN RECORDER ──────────────────────────────────────
|
||||||
|
|
||||||
class TurnRecorder:
|
class TurnRecorder:
|
||||||
"""Saves each turn as two WAV files: user mic + Gemini output.
|
"""Saves each turn as two WAV files: user mic + model output.
|
||||||
|
|
||||||
A turn starts when user audio starts flowing through `capture_user`
|
A turn starts when user audio starts flowing through `capture_user`
|
||||||
and ends on `turn_complete`. Files land in SANAD_RECORD_DIR as
|
and ends on `finish_turn`. Files land in `RECORD_DIR` as
|
||||||
`<timestamp>_user.wav` (16 kHz) and `<timestamp>_robot.wav` (24 kHz).
|
`<timestamp>_user.wav` (at `user_rate`) and `<timestamp>_robot.wav`
|
||||||
|
(at `robot_rate`). An `index.json` in the same directory tracks
|
||||||
An `index.json` maintains a list of all turns with metadata
|
every turn with timestamp + transcripts + durations for the dashboard.
|
||||||
(timestamp, text transcripts, durations) so the dashboard can
|
|
||||||
browse them later.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, enabled: bool = True, out_dir: Path = RECORD_DIR):
|
def __init__(self, enabled: bool = True, out_dir: Path = RECORD_DIR,
|
||||||
|
user_rate: int = SEND_SAMPLE_RATE,
|
||||||
|
robot_rate: int = RECEIVE_SAMPLE_RATE):
|
||||||
self.enabled = enabled
|
self.enabled = enabled
|
||||||
self.out_dir = out_dir
|
self.out_dir = out_dir
|
||||||
|
self.user_rate = user_rate
|
||||||
|
self.robot_rate = robot_rate
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
self.out_dir.mkdir(parents=True, exist_ok=True)
|
self.out_dir.mkdir(parents=True, exist_ok=True)
|
||||||
self._lock = threading.Lock()
|
self._lock = threading.Lock()
|
||||||
@ -182,7 +178,6 @@ class TurnRecorder:
|
|||||||
self._robot_text = (self._robot_text + " " + text).strip()
|
self._robot_text = (self._robot_text + " " + text).strip()
|
||||||
|
|
||||||
def finish_turn(self) -> dict:
|
def finish_turn(self) -> dict:
|
||||||
"""Save current buffers to disk, reset state, return metadata."""
|
|
||||||
if not self.enabled:
|
if not self.enabled:
|
||||||
return {}
|
return {}
|
||||||
with self._lock:
|
with self._lock:
|
||||||
@ -204,15 +199,17 @@ class TurnRecorder:
|
|||||||
"user_text": user_text, "robot_text": robot_text}
|
"user_text": user_text, "robot_text": robot_text}
|
||||||
try:
|
try:
|
||||||
if user_data:
|
if user_data:
|
||||||
user_path = self.out_dir / f"{stamp}_user.wav"
|
p = self.out_dir / f"{stamp}_user.wav"
|
||||||
self._save_wav(user_path, user_data, SEND_RATE)
|
self._save_wav(p, user_data, self.user_rate)
|
||||||
entry["user_wav"] = str(user_path)
|
entry["user_wav"] = str(p)
|
||||||
entry["user_duration_sec"] = round(len(user_data) / (SEND_RATE * 2), 3)
|
entry["user_duration_sec"] = round(
|
||||||
|
len(user_data) / (self.user_rate * 2), 3)
|
||||||
if robot_data:
|
if robot_data:
|
||||||
robot_path = self.out_dir / f"{stamp}_robot.wav"
|
p = self.out_dir / f"{stamp}_robot.wav"
|
||||||
self._save_wav(robot_path, robot_data, RECEIVE_RATE)
|
self._save_wav(p, robot_data, self.robot_rate)
|
||||||
entry["robot_wav"] = str(robot_path)
|
entry["robot_wav"] = str(p)
|
||||||
entry["robot_duration_sec"] = round(len(robot_data) / (RECEIVE_RATE * 2), 3)
|
entry["robot_duration_sec"] = round(
|
||||||
|
len(robot_data) / (self.robot_rate * 2), 3)
|
||||||
self._append_index(entry)
|
self._append_index(entry)
|
||||||
log.info("recorded turn → %s (user %.1fs, robot %.1fs)",
|
log.info("recorded turn → %s (user %.1fs, robot %.1fs)",
|
||||||
stamp,
|
stamp,
|
||||||
@ -222,7 +219,8 @@ class TurnRecorder:
|
|||||||
log.warning("recording save failed: %s", exc)
|
log.warning("recording save failed: %s", exc)
|
||||||
return entry
|
return entry
|
||||||
|
|
||||||
def _save_wav(self, path: Path, pcm: bytes, rate: int) -> None:
|
@staticmethod
|
||||||
|
def _save_wav(path: Path, pcm: bytes, rate: int) -> None:
|
||||||
with wave.open(str(path), "wb") as wf:
|
with wave.open(str(path), "wb") as wf:
|
||||||
wf.setnchannels(1)
|
wf.setnchannels(1)
|
||||||
wf.setsampwidth(2)
|
wf.setsampwidth(2)
|
||||||
@ -242,307 +240,40 @@ class TurnRecorder:
|
|||||||
payload = {"records": []}
|
payload = {"records": []}
|
||||||
payload.setdefault("records", []).append(entry)
|
payload.setdefault("records", []).append(entry)
|
||||||
payload["total_records"] = len(payload["records"])
|
payload["total_records"] = len(payload["records"])
|
||||||
idx_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False),
|
idx_path.write_text(
|
||||||
encoding="utf-8")
|
json.dumps(payload, indent=2, ensure_ascii=False),
|
||||||
|
encoding="utf-8",
|
||||||
|
|
||||||
# Mic + speaker classes now live in voice/audio_io.py — built via
|
|
||||||
# AudioIO.from_profile() in main(). Selected with SANAD_AUDIO_PROFILE
|
|
||||||
# (builtin | anker | hollyland_builtin).
|
|
||||||
|
|
||||||
|
|
||||||
# ─── SESSION ─────────────────────────────────────────────
|
|
||||||
|
|
||||||
async def run_session(mic: Mic, speaker: Speaker, voice: str):
|
|
||||||
client = genai.Client(api_key=API_KEY)
|
|
||||||
recorder = TurnRecorder(enabled=RECORD_ENABLED)
|
|
||||||
if RECORD_ENABLED:
|
|
||||||
log.info("recording enabled → %s", RECORD_DIR)
|
|
||||||
|
|
||||||
config = types.LiveConnectConfig(
|
|
||||||
response_modalities=["AUDIO"],
|
|
||||||
speech_config=types.SpeechConfig(
|
|
||||||
voice_config=types.VoiceConfig(
|
|
||||||
prebuilt_voice_config=types.PrebuiltVoiceConfig(
|
|
||||||
voice_name=voice
|
|
||||||
)
|
|
||||||
)
|
|
||||||
),
|
|
||||||
realtime_input_config=types.RealtimeInputConfig(
|
|
||||||
automatic_activity_detection=types.AutomaticActivityDetection(
|
|
||||||
disabled=False,
|
|
||||||
start_of_speech_sensitivity=getattr(
|
|
||||||
types.StartSensitivity,
|
|
||||||
_cfg_section("voice", "vad").get(
|
|
||||||
"start_sensitivity", "START_SENSITIVITY_HIGH")),
|
|
||||||
end_of_speech_sensitivity=getattr(
|
|
||||||
types.EndSensitivity,
|
|
||||||
_cfg_section("voice", "vad").get(
|
|
||||||
"end_sensitivity", "END_SENSITIVITY_LOW")),
|
|
||||||
prefix_padding_ms=_cfg_section("voice", "vad").get("prefix_padding_ms", 20),
|
|
||||||
silence_duration_ms=_cfg_section("voice", "vad").get("silence_duration_ms", 200),
|
|
||||||
)
|
|
||||||
),
|
|
||||||
input_audio_transcription=types.AudioTranscriptionConfig(),
|
|
||||||
output_audio_transcription=types.AudioTranscriptionConfig(),
|
|
||||||
system_instruction=types.Content(
|
|
||||||
parts=[types.Part(text=SYSTEM_PROMPT)]
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
session_num = 0
|
|
||||||
start_time = time.time()
|
|
||||||
consecutive_errors = 0
|
|
||||||
|
|
||||||
while True:
|
# ─── BRAIN FACTORY ───────────────────────────────────────
|
||||||
session_num += 1
|
|
||||||
speaking = False
|
|
||||||
stream_started = False
|
|
||||||
barge_block_until = 0.0
|
|
||||||
ai_speak_start = 0.0
|
|
||||||
last_ai_audio = 0.0
|
|
||||||
|
|
||||||
_bi = _cfg_section("voice", "barge_in")
|
def _build_brain(name: str, audio_io, recorder, voice: str, system_prompt: str):
|
||||||
BARGE_THRESHOLD = _bi.get("threshold", 500)
|
name = (name or "").strip().lower()
|
||||||
LOUD_CHUNKS_NEEDED = _bi.get("loud_chunks_needed", 3)
|
if name in ("", "gemini"):
|
||||||
BARGE_COOLDOWN = _bi.get("cooldown_sec", 0.3)
|
from Project.Sanad.gemini.script import GeminiBrain
|
||||||
ECHO_SUPPRESS_BELOW = _bi.get("echo_suppress_below", 500)
|
return GeminiBrain(audio_io, recorder, voice, system_prompt)
|
||||||
AI_SPEAK_GRACE_SEC = _bi.get("ai_speak_grace_sec", 0.15)
|
if name == "local":
|
||||||
|
from Project.Sanad.local.script import LocalBrain
|
||||||
uptime_min = (time.time() - start_time) / 60
|
return LocalBrain(audio_io, recorder, voice, system_prompt)
|
||||||
|
if name == "model":
|
||||||
try:
|
from Project.Sanad.voice.model_script import ModelBrain
|
||||||
log.info("connecting to Gemini (session #%d, uptime %.0fm)...",
|
return ModelBrain(audio_io, recorder, voice, system_prompt)
|
||||||
session_num, uptime_min)
|
# To add a provider: import the module and return its brain class here.
|
||||||
async with client.aio.live.connect(model=MODEL, config=config) as session:
|
raise ValueError(f"unknown voice brain: {name!r}")
|
||||||
log.info("connected — speak anytime!")
|
|
||||||
consecutive_errors = 0 # reset on successful connect
|
|
||||||
mic.flush()
|
|
||||||
done = asyncio.Event()
|
|
||||||
|
|
||||||
# ── Send mic ──
|
|
||||||
async def send_mic():
|
|
||||||
nonlocal speaking, barge_block_until
|
|
||||||
chunk_bytes = CHUNK_SAMPLES * 2
|
|
||||||
loud_count = 0
|
|
||||||
last_activity = time.time()
|
|
||||||
loop = asyncio.get_event_loop()
|
|
||||||
|
|
||||||
while not done.is_set():
|
|
||||||
try:
|
|
||||||
raw = await loop.run_in_executor(
|
|
||||||
None, lambda: mic.read_chunk(chunk_bytes))
|
|
||||||
except Exception:
|
|
||||||
break
|
|
||||||
|
|
||||||
# Amplify
|
|
||||||
samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
|
|
||||||
samples = np.clip(samples * MIC_GAIN, -32768, 32767).astype(np.int16)
|
|
||||||
data = samples.tobytes()
|
|
||||||
energy = audio_energy(data)
|
|
||||||
now = time.time()
|
|
||||||
|
|
||||||
# Barge-in
|
|
||||||
if speaking and now >= barge_block_until:
|
|
||||||
if (now - ai_speak_start) >= AI_SPEAK_GRACE_SEC:
|
|
||||||
if energy > BARGE_THRESHOLD:
|
|
||||||
loud_count += 1
|
|
||||||
else:
|
|
||||||
loud_count = max(0, loud_count - 1)
|
|
||||||
if loud_count > LOUD_CHUNKS_NEEDED:
|
|
||||||
log.info("BARGE-IN (e=%d)", energy)
|
|
||||||
do_interrupt("barge-in")
|
|
||||||
loud_count = 0
|
|
||||||
barge_block_until = now + BARGE_COOLDOWN
|
|
||||||
|
|
||||||
# Echo suppression
|
|
||||||
send_data = data
|
|
||||||
if speaking and energy < ECHO_SUPPRESS_BELOW:
|
|
||||||
send_data = SILENCE_PCM[:chunk_bytes]
|
|
||||||
|
|
||||||
# Record user audio (only when clearly speaking,
|
|
||||||
# energy > 250 — skip ambient silence noise)
|
|
||||||
if energy > 250 and not speaking:
|
|
||||||
recorder.capture_user(data)
|
|
||||||
|
|
||||||
# Watchdog
|
|
||||||
if energy > 250:
|
|
||||||
last_activity = now
|
|
||||||
elif now - last_activity > 10:
|
|
||||||
log.info("alive (no speech %.0fs, e=%d, buf=%d)",
|
|
||||||
now - last_activity, energy, len(mic._buf))
|
|
||||||
last_activity = now
|
|
||||||
|
|
||||||
try:
|
|
||||||
await session.send_realtime_input(
|
|
||||||
audio=types.Blob(
|
|
||||||
data=send_data,
|
|
||||||
mime_type=f"audio/pcm;rate={SEND_RATE}"
|
|
||||||
)
|
|
||||||
)
|
|
||||||
except asyncio.CancelledError:
|
|
||||||
return
|
|
||||||
except Exception as e:
|
|
||||||
log.warning("mic send failed: %s — ending session", e)
|
|
||||||
done.set()
|
|
||||||
return
|
|
||||||
|
|
||||||
await asyncio.sleep(CHUNK_SAMPLES / SEND_RATE)
|
|
||||||
|
|
||||||
log.info("send_mic task ended")
|
|
||||||
|
|
||||||
# ── Interrupt helper ──
|
|
||||||
def do_interrupt(source="local"):
|
|
||||||
nonlocal speaking, stream_started
|
|
||||||
speaking = False
|
|
||||||
stream_started = False
|
|
||||||
speaker.stop()
|
|
||||||
mic.flush()
|
|
||||||
recorder.finish_turn()
|
|
||||||
|
|
||||||
# ── Receive ──
|
|
||||||
async def receive():
|
|
||||||
nonlocal speaking, stream_started
|
|
||||||
nonlocal ai_speak_start, last_ai_audio
|
|
||||||
loop = asyncio.get_event_loop()
|
|
||||||
|
|
||||||
try:
|
|
||||||
last_recv = time.time()
|
|
||||||
while not done.is_set():
|
|
||||||
async for response in session.receive():
|
|
||||||
last_recv = time.time()
|
|
||||||
if done.is_set():
|
|
||||||
break
|
|
||||||
|
|
||||||
# Server going away — reconnect soon
|
|
||||||
if hasattr(response, 'go_away') and response.go_away is not None:
|
|
||||||
log.info("server going away — will reconnect")
|
|
||||||
done.set()
|
|
||||||
return
|
|
||||||
|
|
||||||
sc = response.server_content
|
|
||||||
if sc is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Gemini interrupted
|
|
||||||
if sc.interrupted is True:
|
|
||||||
if speaking:
|
|
||||||
log.info("Gemini interrupted")
|
|
||||||
do_interrupt("gemini")
|
|
||||||
continue
|
|
||||||
|
|
||||||
# User transcript
|
|
||||||
if sc.input_transcription:
|
|
||||||
text = (sc.input_transcription.text or "").strip()
|
|
||||||
if text and not speaking:
|
|
||||||
log.info("USER: %s", text)
|
|
||||||
recorder.add_user_text(text)
|
|
||||||
|
|
||||||
# Marcus transcript
|
|
||||||
if sc.output_transcription:
|
|
||||||
text = (sc.output_transcription.text or "").strip()
|
|
||||||
if text:
|
|
||||||
log.info("MARCUS: %s", text)
|
|
||||||
recorder.add_robot_text(text)
|
|
||||||
|
|
||||||
# AI audio
|
|
||||||
if sc.model_turn:
|
|
||||||
for part in sc.model_turn.parts:
|
|
||||||
if part.inline_data and part.inline_data.data:
|
|
||||||
now = time.time()
|
|
||||||
if not speaking:
|
|
||||||
ai_speak_start = now
|
|
||||||
speaking = True
|
|
||||||
last_ai_audio = now
|
|
||||||
raw_audio = part.inline_data.data
|
|
||||||
recorder.capture_robot(raw_audio)
|
|
||||||
audio = np.frombuffer(
|
|
||||||
raw_audio, dtype=np.int16)
|
|
||||||
if not stream_started:
|
|
||||||
await loop.run_in_executor(
|
|
||||||
None, speaker.begin_stream)
|
|
||||||
stream_started = True
|
|
||||||
await loop.run_in_executor(
|
|
||||||
None, speaker.send_chunk,
|
|
||||||
audio, RECEIVE_RATE)
|
|
||||||
|
|
||||||
# Turn complete
|
|
||||||
if sc.turn_complete:
|
|
||||||
if speaking and stream_started and not speaker.interrupted:
|
|
||||||
dur = speaker.total_sent_sec
|
|
||||||
log.info("speaker %.1fs", dur)
|
|
||||||
await loop.run_in_executor(
|
|
||||||
None, speaker.wait_finish)
|
|
||||||
elif speaking and speaker.interrupted:
|
|
||||||
log.info("speaker interrupted")
|
|
||||||
speaking = False
|
|
||||||
stream_started = False
|
|
||||||
mic.flush()
|
|
||||||
recorder.finish_turn()
|
|
||||||
log.info("listening")
|
|
||||||
|
|
||||||
# receive() iterator ended — check if session is still alive
|
|
||||||
if time.time() - last_recv > 30:
|
|
||||||
log.warning("no messages from Gemini for 30s — session dead")
|
|
||||||
break
|
|
||||||
await asyncio.sleep(0.1)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
log.warning("receive ended: %s", e)
|
|
||||||
finally:
|
|
||||||
done.set()
|
|
||||||
|
|
||||||
try:
|
|
||||||
await asyncio.wait_for(
|
|
||||||
asyncio.gather(send_mic(), receive()),
|
|
||||||
timeout=_SV.get("session_timeout_sec", 660), # 11 min max (server go_away at ~10 min)
|
|
||||||
)
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
log.warning("session timed out after 11 min")
|
|
||||||
except asyncio.CancelledError:
|
|
||||||
log.warning("session cancelled")
|
|
||||||
|
|
||||||
log.info("session #%d ended — reconnecting in 1s", session_num)
|
|
||||||
speaker.stop()
|
|
||||||
mic.flush()
|
|
||||||
await asyncio.sleep(1)
|
|
||||||
|
|
||||||
except asyncio.CancelledError:
|
|
||||||
log.info("cancelled — stopping")
|
|
||||||
break
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
log.info("keyboard interrupt — stopping")
|
|
||||||
break
|
|
||||||
except Exception as e:
|
|
||||||
consecutive_errors += 1
|
|
||||||
# Exponential backoff: 2s, 4s, 8s, 16s, max 30s
|
|
||||||
delay = min(30, 2 ** consecutive_errors)
|
|
||||||
log.error("session error (#%d): %s — reconnecting in %ds",
|
|
||||||
consecutive_errors, e, delay)
|
|
||||||
await asyncio.sleep(delay)
|
|
||||||
|
|
||||||
# After 10 consecutive errors, restart the client
|
|
||||||
if consecutive_errors >= 10:
|
|
||||||
log.warning("10 consecutive errors — recreating client")
|
|
||||||
try:
|
|
||||||
client = genai.Client(api_key=API_KEY)
|
|
||||||
consecutive_errors = 0
|
|
||||||
except Exception as ce:
|
|
||||||
log.error("client recreation failed: %s", ce)
|
|
||||||
|
|
||||||
|
|
||||||
# ─── MAIN ────────────────────────────────────────────────
|
# ─── MAIN ────────────────────────────────────────────────
|
||||||
|
|
||||||
def main():
|
def main() -> None:
|
||||||
if len(sys.argv) < 2:
|
if len(sys.argv) < 2:
|
||||||
print(__doc__)
|
print(__doc__)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
iface = sys.argv[1]
|
iface = sys.argv[1]
|
||||||
voice = VOICE_NAME
|
voice = GEMINI_VOICE
|
||||||
if "--voice" in sys.argv:
|
if "--voice" in sys.argv:
|
||||||
idx = sys.argv.index("--voice")
|
voice = sys.argv[sys.argv.index("--voice") + 1]
|
||||||
voice = sys.argv[idx + 1]
|
|
||||||
|
|
||||||
log.info("DDS on %s", iface)
|
log.info("DDS on %s", iface)
|
||||||
ChannelFactoryInitialize(0, iface)
|
ChannelFactoryInitialize(0, iface)
|
||||||
@ -554,27 +285,39 @@ def main():
|
|||||||
profile = os.environ.get("SANAD_AUDIO_PROFILE", "builtin")
|
profile = os.environ.get("SANAD_AUDIO_PROFILE", "builtin")
|
||||||
audio = AudioIO.from_profile(profile, audio_client=ac)
|
audio = AudioIO.from_profile(profile, audio_client=ac)
|
||||||
audio.start()
|
audio.start()
|
||||||
mic, speaker = audio.mic, audio.speaker
|
|
||||||
log.info("audio profile=%s", audio.profile_id)
|
log.info("audio profile=%s", audio.profile_id)
|
||||||
|
|
||||||
|
# Sanity-check the mic before handing it to the brain
|
||||||
log.info("testing mic 2s...")
|
log.info("testing mic 2s...")
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
test = mic.read_chunk(1024)
|
test = audio.mic.read_chunk(1024)
|
||||||
e = audio_energy(test)
|
e = _audio_energy(test)
|
||||||
log.info("mic energy=%d %s", e, "OK" if e > 0 else "SILENT")
|
log.info("mic energy=%d %s", e, "OK" if e > 0 else "SILENT")
|
||||||
|
|
||||||
log.info("voice=%s log=%s", voice, LOG_FILE)
|
recorder = TurnRecorder(enabled=RECORD_ENABLED)
|
||||||
|
if RECORD_ENABLED:
|
||||||
|
log.info("recording enabled → %s", RECORD_DIR)
|
||||||
|
|
||||||
|
system_prompt = _load_system_prompt()
|
||||||
|
brain_name = os.environ.get("SANAD_VOICE_BRAIN", "gemini")
|
||||||
|
brain = _build_brain(brain_name, audio, recorder, voice, system_prompt)
|
||||||
|
log.info("voice brain=%s voice=%s log=%s", brain_name, voice, LOG_FILE)
|
||||||
log.info("─" * 50)
|
log.info("─" * 50)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
asyncio.run(run_session(mic, speaker, voice))
|
asyncio.run(brain.run())
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
pass
|
pass
|
||||||
except Exception as e:
|
except Exception as exc:
|
||||||
log.error("fatal: %s", e)
|
log.error("fatal: %s", exc)
|
||||||
finally:
|
finally:
|
||||||
log.info("stopped")
|
log.info("stopping")
|
||||||
|
try:
|
||||||
|
brain.stop()
|
||||||
|
except Exception:
|
||||||
|
log.warning("brain.stop() failed", exc_info=True)
|
||||||
audio.stop()
|
audio.stop()
|
||||||
|
log.info("stopped")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user