{
  "_description": "Tunables for local/* — fully on-device voice pipeline (Silero VAD → Whisper → Qwen via llama.cpp → CosyVoice2). Loaded via core.config_loader.load('local').",
  "subprocess": {
    "_comment": "local/subprocess.py — LocalSubprocess supervisor. Mirrors gemini/subprocess.py. IMPORTANT: python_bin points at the `local` conda env (Python 3.8 + Jetson CUDA torch) so CosyVoice+Whisper run with GPU, while the dashboard/Gemini stack stays in gemini_sdk (Python 3.10).",
    "python_bin": "/home/unitree/miniconda3/envs/local/bin/python",
    "log_tail_size": 2000,
    "transcript_tail_size": 30,
    "log_name": "local_subprocess",
    "stop_timeout_sec": 5.0,
    "terminate_timeout_sec": 3.0,
    "noisy_prefixes": [
      "ALSA lib ",
      "Expression 'alsa_",
      "Cannot connect to server socket",
      "jack server is not running"
    ],
    "noisy_fragments": [
      "Unknown PCM",
      "Evaluate error",
      "snd_pcm_open_noupdate",
      "PaAlsaStream"
    ]
  },
  "vad": {
    "_comment": "Silero VAD — CPU. Emits speech_start / speech_end events.",
    "sample_rate": 16000,
    "frame_ms": 32,
    "threshold": 0.55,
    "min_silence_ms": 400,
    "min_speech_ms": 250,
    "pad_start_ms": 200,
    "pad_end_ms": 200,
    "device": "cpu"
  },
  "stt": {
    "_comment": "faster-whisper Large V3 Turbo, INT8 on GPU.",
    "model_name": "large-v3-turbo",
    "model_subdir": "faster-whisper-large-v3-turbo",
    "device": "cuda",
    "compute_type": "int8_float16",
    "beam_size": 1,
    "language": null,
    "vad_filter": false,
    "no_speech_threshold": 0.6,
    "min_utterance_chars": 2,
    "temperature": 0.0
  },
  "llm": {
    "_comment": "Qwen 2.5 Instruct via Ollama (default) OR self-managed llama.cpp. Set backend to pick.",
    "backend": "ollama",
    "_ollama_comment": "Ollama daemon — assumes `ollama serve` is running; `ollama pull qwen2.5:1.5b` to fetch.",
    "ollama_host": "127.0.0.1",
    "ollama_port": 11434,
    "ollama_model": "qwen2.5:1.5b",
    "ollama_keep_alive": "5m",
    "_llamacpp_comment": "Self-managed llama-server subprocess. Only used when backend='llama_cpp'.",
    "model_subdir": "qwen2.5-1.5b-instruct-q4_k_m.gguf",
    "server_binary": "llama-server",
    "host": "127.0.0.1",
    "port": 8080,
    "n_gpu_layers": 99,
    "ctx_size": 2048,
    "threads": 4,
    "startup_timeout_sec": 30,
    "_shared_comment": "Generation params — both backends.",
    "request_timeout_sec": 30,
    "max_tokens": 200,
    "temperature": 0.7,
    "top_p": 0.9,
    "stop": ["<|im_end|>", "\n\n\n"],
    "chunk_delimiters": ".,?!؟،",
    "chunk_min_chars": 8
  },
  "tts": {
    "_comment": "CosyVoice2 0.5B streaming — GPU. Uses a 3s reference WAV for voice cloning.",
    "model_subdir": "CosyVoice2-0.5B",
    "reference_wav_subdir": "khaleeji_reference_3s.wav",
    "reference_prompt": "",
    "stream_chunk_sec": 0.25,
    "sample_rate": 16000,
    "queue_max": 3,
    "device": "cuda"
  }
}