Marcus/Config/config_Voice.json

123 lines
7.2 KiB
JSON

{
"tts": {
"_comment": "G1 TtsMaker — used by API/audio_api.py::speak() for all spoken output routed through AudioAPI: Marcus brain replies in the voice pipeline (Gemini is STT-only there and returns no audio — see stt._comment) as well as utterances from other Marcus subsystems (e.g. brain fallback announcements).",
"backend": "builtin_ttsmaker",
"builtin_speaker_id": 2,
"target_sample_rate": 16000
},
"stt": {
"_comment": "Voice pipeline: Gemini Live STT (text-mode) → Marcus brain → TtsMaker. Gemini transcribes the user's speech with server-side VAD; Marcus's brain (Brain/marcus_brain.py) decides the reply and speaks it via AudioAPI.speak → TtsMaker. No audio comes back from Gemini (response_modalities=['TEXT']). Install on Jetson: run `pip install google-genai` inside the separate Python 3.10+ environment used by Voice/gemini_runner.py (see _gemini_python_path_comment), not in the Python 3.8 marcus environment. API key: env MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY fallback).",
"_gemini_comment": "Gemini Live STT-only settings. The actual Gemini WebSocket runs in a SEPARATE Python 3.10+ subprocess (Voice/gemini_runner.py) because google-genai requires Python ≥3.9 and marcus is pinned to Python 3.8 by the NVIDIA Jetson torch wheel. The marcus parent process spawns `gemini_python_path -u Voice/gemini_runner.py` and parses the JSON-line transcripts on stdout. Env overrides: MARCUS_GEMINI_API_KEY / MARCUS_GEMINI_MODEL / MARCUS_GEMINI_PYTHON.",
"_gemini_python_path_comment": "Path to a Python 3.10+ binary that has `google-genai` installed (typically a separate conda env, e.g. `gemini_sdk` on this Jetson). Leave empty to auto-detect — the manager tries ~/miniconda3/envs/gemini_sdk/bin/python and a few common alternates. Override at runtime via env MARCUS_GEMINI_PYTHON.",
"gemini_python_path": "",
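"_gemini_api_key_comment": "Do not commit a real key in this file. Leave gemini_api_key empty in version control and supply the key via env MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY fallback). Any key that was ever committed here must be treated as compromised: revoke and rotate it.",
"gemini_api_key": "",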
"gemini_model": "gemini-2.5-flash-native-audio-preview-12-2025",
"gemini_voice_name": "Charon",
"gemini_audio_profile": "builtin",
"gemini_chunk_size": 512,
"gemini_send_sample_rate": 16000,
"gemini_record_enabled": true,
"_gemini_system_prompt_comment": "Marcus brain is the authoritative reply path; Gemini is just an ear here. Keep the prompt short — it tells Gemini to transcribe, not to chat. Override by pointing gemini_system_prompt_file at a text file (relative paths resolve from PROJECT_ROOT).",
"gemini_system_prompt_file": "",
"gemini_system_prompt": "You are Sanad's ear. Your only job is to transcribe what the user says to Sanad, the humanoid robot. Do not respond conversationally. Do not speculate. Do not invent dialogue. If the user addresses Sanad, return exactly what they said. Stay completely silent in your response.",
"_gemini_vad_comment": "Gemini server-side VAD tuning. start_sensitivity/end_sensitivity accept 'START_SENSITIVITY_HIGH|LOW' and 'END_SENSITIVITY_HIGH|LOW'. HIGH start = eagerly treats any speech-like sound as turn start, LOW = more conservative. LOW end = longer patience before ending a turn, HIGH = cuts turn sooner. prefix_padding_ms preserves audio from just before speech is detected. silence_duration_ms is how much continuous silence ends a turn.",
"gemini_vad_start_sensitivity": "START_SENSITIVITY_HIGH",
"gemini_vad_end_sensitivity": "END_SENSITIVITY_LOW",
"gemini_vad_prefix_padding_ms": 20,
"gemini_vad_silence_duration_ms": 200,
"_gemini_session_comment": "Reconnect / error-handling knobs. session_timeout_sec matches Gemini Live's max session (~11 min). After max_consecutive_errors failures the client is recreated; no_messages_timeout_sec catches dead sessions that stop emitting.",
"gemini_session_timeout_sec": 660,
"gemini_max_reconnect_delay_sec": 30,
"gemini_max_consecutive_errors": 10,
"gemini_no_messages_timeout_sec": 30,
"mic_gain": 1.0,
"_dispatch_comment": "Motion command dispatch side-channel. Marcus listens to Gemini's input_transcription; if the text contains a wake-word variant AND the remainder fuzzy-matches a canonical phrase in command_vocab at >= command_vocab_cutoff, Marcus fires on_command() in parallel to the brain's spoken reply (Gemini itself returns no audio in this pipeline). Dedup on the canonical form within command_cooldown_sec prevents streaming partials from double-firing.",
"command_vocab_cutoff": 0.72,
"command_cooldown_sec": 1.5,
"min_transcription_length": 3,
"_vocab_comment": "wake_words = variants Gemini may produce for 'Sanad' — word-boundary matched in the user transcript. command_vocab = canonical command phrases. The dispatcher fuzzy-matches the transcript (after wake-word strip) against command_vocab. garbage_patterns lists short noise phrases Gemini sometimes emits — rejected before fuzzy-match unless they happen to equal a vocab entry exactly. Edit these to add new vocabulary — NO code change required.",
"wake_words": [
"sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad",
"sanat", "sunnat", "sonnat", "sinnat", "sennat",
"sanid", "sanud", "saned", "sanod", "sanaad",
"senad", "sinad", "sonad", "sunad",
"sanah", "sanath", "sanadh", "sonadh",
"samad", "somad", "sumad",
"thanad", "zanad",
"sa nad", "san ad", "san odd", "san add"
],
"command_vocab": [
"what do you see", "what can you see", "look around",
"come to me", "come here", "come back", "come closer",
"approach", "get closer", "come",
"go home", "go back", "go forward", "go backward",
"go left", "go right", "go",
"sit down", "stand up", "sit", "stand",
"raise arm", "lower arm", "wave hello", "wave", "point",
"turn left", "turn right", "turn around",
"move forward", "move backward", "move back",
"move left", "move right",
"walk forward", "walk backward", "walk back",
"step forward", "step back", "step left", "step right",
"forward", "backward", "back", "left", "right",
"patrol", "stop", "halt", "wait", "pause", "freeze", "hold",
"hello", "hi", "hey", "help",
"who are you", "where are you", "where am i", "what is your name",
"remember this", "forget", "do it again", "repeat", "undo",
"follow me", "stay here"
],
"garbage_patterns": [
"thanks for watching", "thank you for watching",
"thank you", "thanks",
"bye", "goodbye",
".", "you", "yeah",
"okay", "ok",
"um", "uh", "hmm", "mm",
"i", "a"
]
},
"mic": {
"_comment": "Used by API/audio_api.py::record() for non-Gemini capture (e.g. ad-hoc recording commands from other subsystems). Gemini reads the mic via Voice/audio_io.py BuiltinMic directly.",
"backend": "builtin_udp",
"source_index": "3",
"format": "s16le",
"rate": 16000,
"channels": 1
},
"mic_udp": {
"_comment": "G1 on-board mic multicast parameters. Consumed by Voice/audio_io.py BuiltinMic.",
"group": "239.168.123.161",
"port": 5555,
"buffer_max_bytes": 64000,
"read_timeout_sec": 0.04
},
"speaker": {
"_comment": "G1 on-board speaker parameters. dds_interface is the robot's DDS NIC; app_name is the stream label used by AudioClient.PlayStream.",
"dds_interface": "eth0",
"volume": 100,
"app_name": "sanad",
"begin_stream_pause_sec": 0.15,
"wait_finish_margin_sec": 0.3
},
"audio": {
"data_dir": "Data/Voice/Recordings",
"log_file": "logs/voice.log"
},
"messages": {
"ready": "Voice system ready"
}
}