Marcus/Config/config_Voice.json

{
  "tts": {
    "_comment": "G1 TtsMaker — used by API/audio_api.py::speak() for non-Gemini utterances from other Marcus subsystems (e.g. brain fallback announcements). Gemini owns its own voice via gemini_brain; this section does not affect the Gemini path.",
    "backend": "builtin_ttsmaker",
    "builtin_speaker_id": 2,
    "target_sample_rate": 16000
  },
  "stt": {
    "_comment": "Voice pipeline: Gemini Live SPEECH-TO-SPEECH (Sanad pattern). Gemini hears the mic, sees camera frames streamed over from Marcus, and replies with its own voice through the G1 speaker. Marcus's brain still dispatches motion commands via a side channel — when the transcript matches 'Sanad + action', Marcus's command_parser fires the motion silently while Gemini speaks the verbal acknowledgement. The brain's `speak` reply is logged but NOT spoken (avoids double-audio collision with Gemini). Install on Jetson (gemini_sdk env): `pip install google-genai`. API key: env MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY fallback).",
    "_gemini_comment": "Gemini Live S2S settings. The actual Gemini WebSocket runs in a SEPARATE Python 3.10+ subprocess (Voice/gemini_runner.py) because google-genai requires Python ≥3.9 and marcus is pinned to Python 3.8 by the NVIDIA Jetson torch wheel. The runner ALSO owns the G1 speaker (unitree_sdk2py works in the gemini_sdk env), so Gemini's audio plays directly without IPC. The marcus parent process forwards camera frames to the runner via stdin so Gemini can see what the robot sees. Env overrides: MARCUS_GEMINI_API_KEY / MARCUS_GEMINI_MODEL / MARCUS_GEMINI_VOICE / MARCUS_GEMINI_PYTHON.",
    "_gemini_python_path_comment": "Path to a Python 3.10+ binary that has `google-genai` installed (typically a separate conda env, e.g. `gemini_sdk` on this Jetson). Leave empty to auto-detect — the manager tries ~/miniconda3/envs/gemini_sdk/bin/python and a few common alternates. Override at runtime via env MARCUS_GEMINI_PYTHON.",
    "gemini_python_path": "",
    "gemini_api_key": "",
    "gemini_model": "gemini-2.5-flash-native-audio-preview-12-2025",
    "gemini_voice_name": "Charon",
    "gemini_audio_profile": "builtin",
    "gemini_chunk_size": 512,
    "gemini_send_sample_rate": 16000,
    "gemini_receive_sample_rate": 24000,
    "gemini_record_enabled": true,
    "_gemini_camera_comment": "Stream camera frames to Gemini Live so vision answers ('what do you see') are correct rather than hallucinated. The Marcus parent grabs JPEG frames via API.camera_api.get_frame() at the gemini_frame_interval_sec cadence and pipes them to the runner over stdin. gemini_frame_max_age_sec drops stale frames. Set gemini_send_frames=false to disable (saves API tokens but breaks vision questions).",
    "gemini_send_frames": true,
    "gemini_frame_interval_sec": 0.5,
    "gemini_frame_max_age_sec": 1.5,
    "_gemini_barge_comment": "Barge-in = user speaking over Gemini. gemini_barge_loud_chunks_needed consecutive chunks above barge_threshold interrupt Gemini mid-sentence. echo_suppress_below masks mic frames quieter than the threshold during playback so the mic doesn't re-feed Gemini its own voice. On the G1 the on-board speaker is loud enough that echo frames hit ~1500-3000 RMS, well above the old 500 barge threshold — that's why earlier sessions saw self-interrupt loops. Tuned values: threshold 3500 (only a real shout cuts Gemini off), echo_suppress_below 3500 (mute everything below that during AI playback — anything quieter than the speaker's own echo is treated as silence). ai_speak_grace_sec 0.5 gives Gemini a half-second runway before barge can fire. If users genuinely can't interrupt Gemini, drop barge_threshold to ~2500 and accept some self-interrupts.",
    "gemini_barge_threshold": 3500,
    "gemini_barge_loud_chunks_needed": 5,
    "gemini_barge_cooldown_sec": 0.5,
    "gemini_echo_suppress_below": 3500,
    "gemini_ai_speak_grace_sec": 0.5,
    "gemini_begin_stream_pause_sec": 0.15,
    "gemini_wait_finish_margin_sec": 0.3,
    "_gemini_system_prompt_comment": "Persona for Gemini Live's spoken reply. Gemini owns the voice in this architecture, so make this prompt match the experience you want users to hear. The robot's body is controlled by Marcus's brain via a side channel — Gemini doesn't need to invoke motions itself, just acknowledge them naturally. Override by pointing gemini_system_prompt_file at a text file (relative paths resolve from PROJECT_ROOT).",
    "gemini_system_prompt_file": "",
    "gemini_system_prompt": "You are Sanad (سند), a friendly humanoid robot assistant made by YS Lootah Technology in Dubai. Your body is a Unitree G1 humanoid. You can see the user through your camera and talk to them in real time. You speak both English and Arabic naturally — match the user's language in your reply. Reply briefly, usually one or two sentences. When the user asks 'what do you see' / 'ماذا ترى' or describes the scene, look at the camera frames you're receiving and answer accurately based on what's actually there; do not invent details. CRITICAL ACTION RULE — physical motion only happens when the user addresses you by name 'Sanad' (English) or 'سند' (Arabic) AND gives an action. Examples: 'Sanad, turn right' → say 'Turning right.' 'سند، استدر يميناً' → say 'أستدير يميناً.' Plain conversation or vision queries WITHOUT 'Sanad' / 'سند' are fine but DO NOT trigger any motion confirmation — just chat or describe. NEVER say 'Turning' / 'Moving' / 'Sitting' / 'أستدير' / 'أتحرك' unless the user actually said 'Sanad' / 'سند' first. When you do say a motion confirmation, use the same language the user used. Motion verbs supported (English / Arabic): turn left/right (استدر يميناً/يساراً), turn around (استدر للخلف), move forward/back (تحرك للأمام/للخلف), sit down (اجلس), stand up (قف), wave hello (لوّح), raise/lower arm (ارفع/اخفض يدك), come here (تعال), follow me (اتبعني), stay here (ابق هنا), go home (اذهب للبيت), stop (توقف), patrol (طوف), look around (انظر حولك).",
    "_gemini_vad_comment": "Gemini server-side VAD tuning. start_sensitivity/end_sensitivity accept 'START_SENSITIVITY_HIGH|LOW' and 'END_SENSITIVITY_HIGH|LOW'. HIGH start = eagerly treats any speech-like sound as a turn start, LOW = more conservative. LOW end = longer patience before ending a turn, HIGH = cuts the turn sooner. prefix_padding_ms preserves audio from just before speech is detected. silence_duration_ms is how long a stretch of quiet ends a turn.",
    "gemini_vad_start_sensitivity": "START_SENSITIVITY_HIGH",
    "gemini_vad_end_sensitivity": "END_SENSITIVITY_LOW",
    "gemini_vad_prefix_padding_ms": 20,
    "gemini_vad_silence_duration_ms": 200,
    "_gemini_session_comment": "Reconnect / error-handling knobs. session_timeout_sec matches Gemini Live's max session (~11 min). After max_consecutive_errors failures the client is recreated; no_messages_timeout_sec catches dead sessions that stop emitting.",
    "gemini_session_timeout_sec": 660,
    "gemini_max_reconnect_delay_sec": 30,
    "gemini_max_consecutive_errors": 10,
    "gemini_no_messages_timeout_sec": 30,
    "mic_gain": 1.0,
    "_dispatch_comment": "Motion command dispatch side-channel. Marcus listens to Gemini's input_transcription; if the text contains a wake-word variant AND the remainder fuzzy-matches a canonical phrase in command_vocab at >= command_vocab_cutoff, Marcus fires on_command() in parallel to Gemini's verbal reply. Dedup on the canonical form within command_cooldown_sec prevents streaming partials from double-firing.",
    "command_vocab_cutoff": 0.72,
    "command_cooldown_sec": 1.5,
    "min_transcription_length": 3,
    "_vocab_comment": "wake_words and command_vocab now live in Config/instruction.json — the single source of truth for all bilingual phrase tables (wake variants + per-action user_phrases + per-action bot_phrases, English AND Arabic). garbage_patterns stays here because it is noise filtering, not voice instruction.",
    "garbage_patterns": [
      "thanks for watching", "thank you for watching",
      "thank you", "thanks",
      "bye", "goodbye",
      ".", "you", "yeah",
      "okay", "ok",
      "um", "uh", "hmm", "mm",
      "i", "a"
    ]
  },
  "mic": {
    "_comment": "Used by API/audio_api.py::record() for non-Gemini capture (e.g. ad-hoc recording commands from other subsystems). Gemini reads the mic via Voice/audio_io.py BuiltinMic directly.",
    "backend": "builtin_udp",
    "source_index": "3",
    "format": "s16le",
    "rate": 16000,
    "channels": 1
  },
  "mic_udp": {
    "_comment": "G1 on-board mic multicast parameters. Consumed by Voice/audio_io.py BuiltinMic.",
    "group": "239.168.123.161",
    "port": 5555,
    "buffer_max_bytes": 64000,
    "read_timeout_sec": 0.04
  },
  "speaker": {
    "_comment": "G1 on-board speaker parameters. dds_interface is the robot's DDS NIC; app_name is the stream label used by AudioClient.PlayStream. volume is 0-100; lowered from 100 to 70 because the on-board mic picks up the on-board speaker's echo strongly enough to feed Gemini Live a self-loop at full volume — see the gemini_barge* tunings in the stt section.",
    "dds_interface": "eth0",
    "volume": 70,
    "app_name": "sanad",
    "begin_stream_pause_sec": 0.15,
    "wait_finish_margin_sec": 0.3
  },
  "audio": {
    "data_dir": "Data/Voice/Recordings",
    "log_file": "logs/voice.log"
  },
  "messages": {
    "ready": "Voice system ready"
  }
}
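
The dispatch side-channel described in `_dispatch_comment` can be sketched roughly as follows. This is a minimal illustration under stated assumptions, not Marcus's actual command_parser: `match_command`, `CommandDispatcher`, and the inline wake-word/vocab tables are hypothetical stand-ins for the real tables in Config/instruction.json, and `difflib.get_close_matches` stands in for whatever fuzzy matcher Marcus uses at the 0.72 cutoff.

```python
import difflib
import time
from typing import Optional

# Hypothetical stand-ins: the real tables live in Config/instruction.json.
WAKE_WORDS = ("sanad", "سند")
COMMAND_VOCAB = {
    "turn right": "turn_right",
    "turn left": "turn_left",
    "sit down": "sit",
    "stand up": "stand",
}

def match_command(transcript: str, cutoff: float = 0.72) -> Optional[str]:
    """Return a canonical action if the transcript is 'wake word + action', else None."""
    text = transcript.strip().lower()
    if len(text) < 3:                       # min_transcription_length
        return None
    wake = next((w for w in WAKE_WORDS if w in text), None)
    if wake is None:
        return None                         # plain chat / vision query: no motion
    remainder = text.split(wake, 1)[1].strip(" ,.!?،")
    # Fuzzy-match the remainder against canonical phrases at >= command_vocab_cutoff.
    hits = difflib.get_close_matches(remainder, COMMAND_VOCAB, n=1, cutoff=cutoff)
    return COMMAND_VOCAB[hits[0]] if hits else None

class CommandDispatcher:
    """Dedup repeated canonical commands within command_cooldown_sec,
    so streaming transcription partials don't double-fire a motion."""

    def __init__(self, cooldown_sec: float = 1.5):
        self.cooldown = cooldown_sec
        self._last = {}                     # canonical action -> last fire time

    def maybe_fire(self, transcript: str, now: Optional[float] = None) -> Optional[str]:
        action = match_command(transcript)
        if action is None:
            return None
        now = time.monotonic() if now is None else now
        if now - self._last.get(action, float("-inf")) < self.cooldown:
            return None                     # duplicate partial within cooldown
        self._last[action] = now
        return action                       # caller would invoke on_command(action)
```

In this sketch a transcript without the wake word never produces a motion, matching the CRITICAL ACTION RULE in the system prompt, and the cooldown map mirrors the "dedup on the canonical form" behaviour the comment describes.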