{
  "_description": "Tunables for local/* — fully on-device voice pipeline (Silero VAD → Whisper → Qwen via llama.cpp → CosyVoice2). Loaded via core.config_loader.load('local').",
  "subprocess": {
    "_comment": "local/subprocess.py — LocalSubprocess supervisor. Mirrors gemini/subprocess.py. IMPORTANT: python_bin points at the `local` conda env (Python 3.8 + Jetson CUDA torch) so CosyVoice+Whisper run with GPU, while the dashboard/Gemini stack stays in gemini_sdk (Python 3.10).",
    "python_bin": "/home/unitree/miniconda3/envs/local/bin/python",
    "log_tail_size": 2000,
    "transcript_tail_size": 30,
    "log_name": "local_subprocess",
    "stop_timeout_sec": 5.0,
    "terminate_timeout_sec": 3.0,
    "noisy_prefixes": [
      "ALSA lib ",
      "Expression 'alsa_",
      "Cannot connect to server socket",
      "jack server is not running"
    ],
    "noisy_fragments": [
      "Unknown PCM",
      "Evaluate error",
      "snd_pcm_open_noupdate",
      "PaAlsaStream"
    ]
  },
  "vad": {
    "_comment": "Silero VAD — CPU. Emits speech_start / speech_end events.",
    "sample_rate": 16000,
    "frame_ms": 32,
    "threshold": 0.55,
    "min_silence_ms": 400,
    "min_speech_ms": 250,
    "pad_start_ms": 200,
    "pad_end_ms": 200,
    "device": "cpu"
  },
  "stt": {
    "_comment": "faster-whisper Large V3 Turbo, INT8 on GPU.",
    "model_name": "large-v3-turbo",
    "model_subdir": "faster-whisper-large-v3-turbo",
    "device": "cuda",
    "compute_type": "int8_float16",
    "beam_size": 1,
    "language": null,
    "vad_filter": false,
    "no_speech_threshold": 0.6,
    "min_utterance_chars": 2,
    "temperature": 0.0
  },
  "llm": {
    "_comment": "Qwen 2.5 Instruct via Ollama (default) OR self-managed llama.cpp. Set backend to pick.",
    "backend": "ollama",
    "_ollama_comment": "Ollama daemon — assumes `ollama serve` is running; `ollama pull qwen2.5:1.5b` to fetch.",
    "ollama_host": "127.0.0.1",
    "ollama_port": 11434,
    "ollama_model": "qwen2.5:1.5b",
    "ollama_keep_alive": "5m",
    "_llamacpp_comment": "Self-managed llama-server subprocess. Only used when backend='llama_cpp'.",
    "model_subdir": "qwen2.5-1.5b-instruct-q4_k_m.gguf",
    "server_binary": "llama-server",
    "host": "127.0.0.1",
    "port": 8080,
    "n_gpu_layers": 99,
    "ctx_size": 2048,
    "threads": 4,
    "startup_timeout_sec": 30,
    "_shared_comment": "Generation params — both backends.",
    "request_timeout_sec": 30,
    "max_tokens": 200,
    "temperature": 0.7,
    "top_p": 0.9,
    "stop": ["<|im_end|>", "\n\n\n"],
    "chunk_delimiters": ".,?!؟،",
    "chunk_min_chars": 8
  },
  "tts": {
    "_comment": "CosyVoice2 0.5B streaming — GPU. Uses a 3s reference WAV for voice cloning.",
    "model_subdir": "CosyVoice2-0.5B",
    "reference_wav_subdir": "khaleeji_reference_3s.wav",
    "reference_prompt": "",
    "stream_chunk_sec": 0.25,
    "sample_rate": 16000,
    "queue_max": 3,
    "device": "cuda"
  }
}