Update 2026-04-27 09:39:12

kassam 2026-04-27 09:39:13 +04:00
parent 9485601e18
commit 211d4f52ab
17 changed files with 2062 additions and 1855 deletions

View File

@@ -214,7 +214,17 @@ def _init_voice():
     if isinstance(result, dict):
         sp = (result.get("speak") or "").strip()
         if sp and _audio_api:
+            # Drop Gemini's mic buffer so the robot's own voice
+            # (picked up by the mic during TtsMaker playback)
+            # doesn't get transcribed and fed back as a new
+            # "user" utterance.
+            if _voice_module is not None:
+                try: _voice_module.flush_mic()
+                except Exception: pass
             _audio_api.speak(sp)
+            if _voice_module is not None:
+                try: _voice_module.flush_mic()
+                except Exception: pass
     # Redraw the Command: prompt that our print clobbered
     print("Command: ", end="", flush=True)

View File

@@ -1,47 +1,49 @@
 {
   "tts": {
+    "_comment": "G1 TtsMaker — used by API/audio_api.py::speak() for non-Gemini utterances from other Marcus subsystems (e.g. brain fallback announcements). Gemini owns its own voice via gemini_brain; this section does not affect the Gemini path.",
     "backend": "builtin_ttsmaker",
     "builtin_speaker_id": 2,
     "target_sample_rate": 16000
   },
   "stt": {
-    "_backend_comment": "'faster_whisper' (Whisper base.en int8 on CPU) or 'moonshine' (useful-sensors Moonshine via onnxruntime). Moonshine has a different error profile — worth trying when Whisper consistently mishears commands as short hallucinations like 'Yes.', 'Bye.', 'It.'. Moonshine requires `pip install moonshine-voice` on the Jetson.",
-    "backend": "moonshine",
-    "moonshine_language": "en",
-    "_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.",
-    "_mode_comment": "Three modes. 'always_on_gated' (default, Sanad-style) = continuously transcribe+log every utterance to logs/transcript.log. Dispatch only when wake word is in the utterance. 'Sanad <cmd>' dispatches the cmd. Bare 'Sanad' → speak 'Yes' and treat the NEXT utterance as the command (within await_command_timeout_sec). 'wake_and_command' = classic acoustic wake first, then record a separate command (no always-listen overhead). 'always_on' = transcribe + dispatch everything, no gate (chatty).",
-    "mode": "wake_and_command",
-    "await_command_timeout_sec": 10.0,
+    "_comment": "Voice pipeline: Gemini Live STT (text-mode) → Marcus brain → TtsMaker. Gemini transcribes the user's speech with server-side VAD; Marcus's brain (Brain/marcus_brain.py) decides the reply and speaks it via AudioAPI.speak → TtsMaker. No audio comes back from Gemini (response_modalities=['TEXT']). Install on Jetson: `pip install google-genai`. API key: env MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY fallback).",
+    "_gemini_comment": "Gemini Live STT-only settings. The actual Gemini WebSocket runs in a SEPARATE Python 3.10+ subprocess (Voice/gemini_runner.py) because google-genai requires Python ≥3.9 and marcus is pinned to Python 3.8 by the NVIDIA Jetson torch wheel. The marcus parent process spawns `gemini_python_path -u Voice/gemini_runner.py` and parses the JSON-line transcripts on stdout. Env overrides: MARCUS_GEMINI_API_KEY / MARCUS_GEMINI_MODEL / MARCUS_GEMINI_PYTHON.",
+    "_gemini_python_path_comment": "Path to a Python 3.10+ binary that has `google-genai` installed (typically a separate conda env, e.g. `gemini_sdk` on this Jetson). Leave empty to auto-detect — the manager tries ~/miniconda3/envs/gemini_sdk/bin/python and a few common alternates. Override at runtime via env MARCUS_GEMINI_PYTHON.",
+    "gemini_python_path": "",
+    "gemini_api_key": "",
+    "gemini_model": "gemini-2.5-flash-native-audio-preview-12-2025",
+    "gemini_voice_name": "Charon",
+    "gemini_audio_profile": "builtin",
+    "gemini_chunk_size": 512,
+    "gemini_send_sample_rate": 16000,
+    "gemini_record_enabled": true,
-    "_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.",
-    "always_on_speech_entry_rms": 150.0,
-    "always_on_silence_exit_rms": 70.0,
-    "always_on_silence_duration_sec": 0.8,
-    "always_on_min_utterance_sec": 0.3,
-    "always_on_max_utterance_sec": 12.0,
-    "always_on_idle_log_sec": 5.0,
-    "always_on_ambient_mult": 1.4,
-    "always_on_ambient_window_chunks": 100,
+    "_gemini_system_prompt_comment": "Marcus brain is the authoritative reply path; Gemini is just an ear here. Keep the prompt short — it tells Gemini to transcribe, not to chat. Override by pointing gemini_system_prompt_file at a text file (relative paths resolve from PROJECT_ROOT).",
+    "gemini_system_prompt_file": "",
+    "gemini_system_prompt": "You are Sanad's ear. Your only job is to transcribe what the user says to Sanad, the humanoid robot. Do not respond conversationally. Do not speculate. Do not invent dialogue. If the user addresses Sanad, return exactly what they said. Stay completely silent in your response.",
+    "_gemini_vad_comment": "Gemini server-side VAD tuning. start_sensitivity/end_sensitivity accept 'START_SENSITIVITY_HIGH|LOW' and 'END_SENSITIVITY_HIGH|LOW'. HIGH start = eagerly treats any speech-like sound as turn start, LOW = more conservative. LOW end = longer patience before ending a turn, HIGH = cuts turn sooner. prefix_padding_ms preserves audio from just before speech is detected. silence_duration_ms is how long of quiet ends a turn.",
+    "gemini_vad_start_sensitivity": "START_SENSITIVITY_HIGH",
+    "gemini_vad_end_sensitivity": "END_SENSITIVITY_LOW",
+    "gemini_vad_prefix_padding_ms": 20,
+    "gemini_vad_silence_duration_ms": 200,
-    "whisper_model": "base.en",
-    "whisper_device": "cpu",
-    "whisper_compute_type": "int8",
-    "_whisper_tuning_comment": "base.en is the only model that decodes fast enough on Jetson Orin NX CPU. TESTED: small.en takes 10-12s per 1s burst (unusable); base.en runs ~2-3s per burst. tiny.en is even faster (~1s) but noticeably worse accuracy. If accuracy is poor on base.en (garbled transcriptions), the fix is hardware — switch to a close-talking USB mic (Hollyland) via mic.backend:pactl_parec. small.en cached in ~/.cache/huggingface/hub/ if you want to experiment again — try it on an x86 dev machine to see the accuracy gain before blaming Jetson.",
+    "_gemini_session_comment": "Reconnect / error-handling knobs. session_timeout_sec matches Gemini Live's max session (~11 min). After max_consecutive_errors failures the client is recreated; no_messages_timeout_sec catches dead sessions that stop emitting.",
+    "gemini_session_timeout_sec": 660,
+    "gemini_max_reconnect_delay_sec": 30,
+    "gemini_max_consecutive_errors": 10,
+    "gemini_no_messages_timeout_sec": 30,
     "mic_gain": 1.0,
-    "whisper_beam_size": 8,
-    "whisper_no_speech_threshold": 0.85,
-    "whisper_log_prob_threshold": -1.8,
-    "whisper_compression_ratio_threshold": 3.0,
-    "whisper_temperature_fallback": [0.0, 0.2, 0.4],
-    "_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.",
-    "_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.",
-    "_initial_prompt_comment": "EMPTY BY DEFAULT. Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). Clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to a single short cue if you want nudging and can tolerate occasional echoes.",
-    "whisper_initial_prompt": "Sanad. Turn left. Turn right. Move forward. Move back. Stop. Sit down. Stand up. Wave hello. Follow me. Come here. Go home. Look around. What do you see.",
-    "_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.",
+    "_dispatch_comment": "Motion command dispatch side-channel. Marcus listens to Gemini's input_transcription; if the text contains a wake-word variant AND the remainder fuzzy-matches a canonical phrase in command_vocab at >= command_vocab_cutoff, Marcus fires on_command() in parallel to Gemini's text reply. Dedup on the canonical form within command_cooldown_sec prevents streaming partials from double-firing.",
+    "command_vocab_cutoff": 0.72,
+    "command_cooldown_sec": 1.5,
+    "min_transcription_length": 3,
+    "_vocab_comment": "wake_words = variants Gemini may produce for 'Sanad' — word-boundary matched in the user transcript. command_vocab = canonical command phrases. The dispatcher fuzzy-matches the transcript (after wake-word strip) against command_vocab. garbage_patterns lists short noise phrases Gemini sometimes emits — rejected before fuzzy-match unless they happen to equal a vocab entry exactly. Edit these to add new vocabulary — NO code change required.",
     "wake_words": [
       "sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad",
       "sanat", "sunnat", "sonnat", "sinnat", "sennat",
@@ -52,7 +54,6 @@
       "thanad", "zanad",
       "sa nad", "san ad", "san odd", "san add"
     ],
-    "_wake_words_exclude_comment": "DELIBERATELY EXCLUDED from wake_words: 'said', 'sent', 'sand', 'sandy', 'sunday', 'signed', 'synod', 'sonata', 'sonnet', 'senate', 'sane', 'saint', 'sana'. These collide with common English and would false-trigger the gate.",
     "command_vocab": [
       "what do you see", "what can you see", "look around",
       "come to me", "come here", "come back", "come closer",
@@ -73,8 +74,6 @@
       "remember this", "forget", "do it again", "repeat", "undo",
       "follow me", "stay here"
     ],
-    "command_vocab_cutoff": 0.72,
-    "_garbage_comment": "Whisper's known 'no phonetic content' outputs on low-SNR audio. YouTube outros, filler words, single-letter hallucinations. Any transcription matching one of these (or shorter than min_transcription_length) is rejected before fuzzy-match — treated as silence.",
     "garbage_patterns": [
       "thanks for watching", "thank you for watching",
       "thank you", "thanks",
@@ -83,72 +82,41 @@
       "okay", "ok",
       "um", "uh", "hmm", "mm",
       "i", "a"
-    ],
-    "min_transcription_length": 3,
-    "_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin.",
-    "speech_threshold": 200.0,
-    "min_word_duration": 0.25,
-    "max_word_duration": 2.50,
-    "post_silence": 0.20,
-    "wake_cooldown": 1.00,
-    "wake_chunk_ms": 50,
-    "wake_adaptive_window_n": 50,
-    "wake_adaptive_mult": 2.0,
-    "wake_diag_log_sec": 3.0,
-    "wake_ack": "tts",
-    "_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).",
-    "_wake_verify_comment": "ENABLED — confirm each acoustic wake with a lightweight Whisper decode on the triggering burst BEFORE speaking 'Yes' / opening the record window. Accept if the transcription contains any wake-word variant OR starts with s/sh/z (Whisper's consistent signature for mishearing 'Sanad' as 'Stop', 'Sand', 'Set', etc.). Reject pure noise (empty whisper) and clearly non-/sa-/ speech silently. Cost: ~1-2s extra per wake for the Whisper decode. Trade-off: slower wake response, but no 'Yes' played on stray loud noises. Set false if you prefer speed over false-wake filtering.",
-    "wake_verify_enabled": true,
-    "_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.",
-    "speech_entry_rms": 400.0,
-    "silence_exit_rms": 200.0,
-    "_vad_tuning_comment": "silence_duration_sec = how long of quiet ends an utterance. 0.6 cuts short commands fast (good UX) but may clip a thinking pause. ambient_mult = effective_entry multiplier over measured ambient (cmd entry = max(speech_entry_rms, ambient * mult * 1.8)). Smaller = more eager, catches quieter speech. 1.5 matches the wake-mult behavior.",
-    "silence_duration_sec": 0.6,
-    "max_record_sec": 5.0,
-    "min_record_sec": 0.4,
-    "ambient_probe_sec": 0.2,
-    "ambient_mult": 1.5,
-    "ambient_cap_rms": 200.0,
-    "_recording_comment": "Debug recording — save every command turn's audio to Data/Voice/Recordings/ as WAV. Filename includes epoch timestamp + transcription slug so you can replay what Whisper got and compare to what it heard. Rotates to keep most recent N files. Filename prefixes: 'cmd_*' = successful transcription, 'unk_*' = empty/rejected.",
-    "recording_enabled": true,
-    "recording_keep_count": 50,
-    "command_cooldown_sec": 1.5,
-    "post_tts_settle_sec": 0.4,
-    "_post_tts_settle_comment": "Time the mic ignores input AFTER the robot finishes speaking. Too short → TTS echo becomes false utterance. Too long → user speaks during the dead window and first syllables are clipped. 0.4s matches the G1 speaker decay at mic_gain=1.0; raise if you bump mic_gain above 1.5, lower if users report 'it cut off my first word'."
+    ]
   },
   "mic": {
+    "_comment": "Used by API/audio_api.py::record() for non-Gemini capture (e.g. ad-hoc recording commands from other subsystems). Gemini reads the mic via Voice/audio_io.py BuiltinMic directly.",
     "backend": "builtin_udp",
     "source_index": "3",
     "format": "s16le",
     "rate": 16000,
     "channels": 1
   },
   "mic_udp": {
+    "_comment": "G1 on-board mic multicast parameters. Consumed by Voice/audio_io.py BuiltinMic.",
     "group": "239.168.123.161",
     "port": 5555,
     "buffer_max_bytes": 64000,
     "read_timeout_sec": 0.04
   },
   "speaker": {
+    "_comment": "G1 on-board speaker parameters. dds_interface is the robot's DDS NIC; app_name is the stream label used by AudioClient.PlayStream.",
     "dds_interface": "eth0",
     "volume": 100,
-    "app_name": "sanad"
+    "app_name": "sanad",
+    "begin_stream_pause_sec": 0.15,
+    "wait_finish_margin_sec": 0.3
   },
   "audio": {
     "data_dir": "Data/Voice/Recordings",
     "log_file": "logs/voice.log"
   },
   "messages": {
-    "wake_heard": "Yes",
-    "no_speech": "I didn't catch that, please say it again",
-    "error_tts": "Speech synthesis failed",
-    "error_mic": "Microphone error",
     "ready": "Voice system ready"
   }
 }
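For reference, the `gemini_vad_*` strings map one-to-one onto the Live API's automatic-activity-detection config. A minimal sketch, assuming the `google-genai` SDK inside the gemini_sdk env; field names follow the public Live API docs, and whether `gemini_runner.py` builds its config exactly this way is an assumption:

```python
from google.genai import types

stt = {  # the relevant subset of config_Voice.json::stt
    "gemini_vad_start_sensitivity": "START_SENSITIVITY_HIGH",
    "gemini_vad_end_sensitivity": "END_SENSITIVITY_LOW",
    "gemini_vad_prefix_padding_ms": 20,
    "gemini_vad_silence_duration_ms": 200,
}

live_config = types.LiveConnectConfig(
    response_modalities=["TEXT"],  # STT-only: Gemini transcribes, never speaks
    input_audio_transcription=types.AudioTranscriptionConfig(),
    realtime_input_config=types.RealtimeInputConfig(
        automatic_activity_detection=types.AutomaticActivityDetection(
            start_of_speech_sensitivity=stt["gemini_vad_start_sensitivity"],
            end_of_speech_sensitivity=stt["gemini_vad_end_sensitivity"],
            prefix_padding_ms=stt["gemini_vad_prefix_padding_ms"],
            silence_duration_ms=stt["gemini_vad_silence_duration_ms"],
        )
    ),
)
# used as: async with client.aio.live.connect(model=..., config=live_config) as session: ...
```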

View File

@@ -0,0 +1,250 @@
{
"_description": "Gemini action dispatch — maps spoken phrases to canonical motion commands. Mirrors Sanad's scripts/sanad_arm.txt pattern (Project/Sanad/scripts/sanad_arm.txt) but in JSON with action groups instead of a Python-set file. When stt.backend='gemini', Voice/marcus_voice.py::_dispatch_gemini_command matches the user's transcript (after stripping 'Sanad') against 'phrases' under each action and fires on_command with the action's 'canonical' string. Edit this file to add new spoken variants WITHOUT touching code.",
"_format": "actions.<action_key>.phrases — array of spoken variants (lowercase, punctuation stripped). Match is whole-word, case-insensitive. One phrase hit = fire.\nactions.<action_key>.canonical — the string passed to self._on_command(text, 'en'). Must be a recognised command in Brain/command_parser.py.\nactions.<action_key>.description — human-only; dispatcher ignores it.\nNon-motion conversation ('how are you', 'who are you', 'what do you see') is NOT listed here — Gemini answers those naturally via voice. Only physical actions live in this file.",
"settings": {
"_comment": "Dispatcher behaviour. require_wake_word=true means the transcript must contain 'Sanad' (or a fuzzy variant from stt.wake_words) before any phrase is considered — matches the current Marcus persona rule. fire_on_wake_match=true fires the action instantly on transcript; false defers until Gemini's turn_complete (robot speaks the acknowledgement first, then moves) — mirrors Sanad's fire_on_wake_match flag in voice/text_utils.maybe_trigger_arm.",
"trigger_enabled": true,
"require_wake_word": true,
"fire_on_wake_match": true,
"stream_buffer_sec": 2.0,
"dedup_window_sec": 2.0,
"repeat_suppress_sec": 0.25,
"pending_action_ttl_sec": 6.0
},
"actions": {
"turn_left": {
"canonical": "turn left",
"description": "Rotate in place 90° to the left.",
"phrases": [
"turn left",
"rotate left",
"spin left",
"go left",
"face left"
]
},
"turn_right": {
"canonical": "turn right",
"description": "Rotate in place 90° to the right.",
"phrases": [
"turn right",
"rotate right",
"spin right",
"go right",
"face right"
]
},
"turn_around": {
"canonical": "turn around",
"description": "Rotate 180°.",
"phrases": [
"turn around",
"turn back",
"spin around",
"about face",
"face the other way"
]
},
"move_forward": {
"canonical": "move forward",
"description": "Walk forward one step interval.",
"phrases": [
"move forward",
"go forward",
"walk forward",
"step forward",
"forward",
"keep going",
"walk ahead"
]
},
"move_back": {
"canonical": "move backward",
"description": "Walk backward one step interval.",
"phrases": [
"move back",
"move backward",
"go back",
"go backward",
"walk back",
"walk backward",
"step back",
"backward",
"reverse"
]
},
"step_left": {
"canonical": "move left",
"description": "Sidestep left.",
"phrases": [
"step left",
"move left",
"slide left",
"strafe left"
]
},
"step_right": {
"canonical": "move right",
"description": "Sidestep right.",
"phrases": [
"step right",
"move right",
"slide right",
"strafe right"
]
},
"stop": {
"canonical": "stop",
"description": "Halt current motion immediately.",
"phrases": [
"stop",
"halt",
"wait",
"pause",
"freeze",
"hold",
"stop moving",
"stand still",
"don't move"
]
},
"sit_down": {
"canonical": "sit down",
"description": "Sit down to the ground from standing.",
"phrases": [
"sit down",
"sit",
"take a seat",
"have a seat"
]
},
"stand_up": {
"canonical": "stand up",
"description": "Stand up from sitting.",
"phrases": [
"stand up",
"stand",
"get up",
"rise"
]
},
"wave_hello": {
"canonical": "wave hello",
"description": "Wave with the right arm.",
"phrases": [
"wave hello",
"wave",
"say hi",
"greet",
"wave to me",
"wave at me"
]
},
"raise_arm": {
"canonical": "raise arm",
"description": "Raise the right arm straight up.",
"phrases": [
"raise arm",
"raise your arm",
"lift your arm",
"arm up",
"hand up"
]
},
"lower_arm": {
"canonical": "lower arm",
"description": "Return the arm to the resting position.",
"phrases": [
"lower arm",
"lower your arm",
"drop your arm",
"arm down",
"hand down",
"rest your arm"
]
},
"point": {
"canonical": "point",
"description": "Point with the right arm (used after 'look at ...').",
"phrases": [
"point",
"point at it",
"point to it",
"point there"
]
},
"come_here": {
"canonical": "come here",
"description": "Approach the speaker.",
"phrases": [
"come here",
"come to me",
"come closer",
"approach",
"get closer",
"come over here"
]
},
"follow_me": {
"canonical": "follow me",
"description": "Follow the speaker until told to stop.",
"phrases": [
"follow me",
"come with me",
"walk with me"
]
},
"stay_here": {
"canonical": "stay here",
"description": "Stop following and hold position.",
"phrases": [
"stay here",
"stay",
"wait here",
"hold position",
"don't follow me"
]
},
"go_home": {
"canonical": "go home",
"description": "Return to the home position.",
"phrases": [
"go home",
"return home",
"head home",
"go back home"
]
},
"patrol": {
"canonical": "patrol",
"description": "Start the patrol routine.",
"phrases": [
"patrol",
"start patrol",
"begin patrol",
"patrol the area",
"walk the route"
]
},
"look_around": {
"canonical": "look around",
"description": "Scan the environment (vision sweep).",
"phrases": [
"look around",
"scan the room",
"scan around",
"survey the area",
"have a look around"
]
}
}
}
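The matching rule described in `_format` (whole-word, case-insensitive, one phrase hit = fire) reduces to a small lookup. A sketch under those stated rules only; it ignores the `settings` block (wake gating, dedup, TTL), and the function names are illustrative rather than the real `_dispatch_gemini_command` internals:

```python
import json
import re

def load_phrase_table(path):
    """Build [(compiled whole-word pattern, canonical)] from the actions file."""
    with open(path) as f:
        actions = json.load(f)["actions"]
    table = []
    for action in actions.values():
        for phrase in action["phrases"]:
            table.append((re.compile(rf"\b{re.escape(phrase)}\b"), action["canonical"]))
    # longest phrase first so "turn around" beats a shorter overlapping variant
    table.sort(key=lambda row: -len(row[0].pattern))
    return table

def match_action(transcript, table):
    text = re.sub(r"[^\w\s]", " ", transcript.lower())  # lowercase, strip punctuation
    for pattern, canonical in table:
        if pattern.search(text):
            return canonical   # one phrase hit = fire
    return None
```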

View File

@@ -29,7 +29,7 @@
 | Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. |
 | Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. |
 | Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. |
-| Voice stack finalised | `Voice/marcus_voice.py`, `Voice/wake_detector.py` | Custom energy wake detector (pure numpy) + Whisper verify + faster-whisper command STT + fuzzy-match to canonical commands. Vosk experiment reverted; Gemini Live reverted. Single local STT engine. |
+| Voice stack — Gemini Live STT + TtsMaker hybrid | `Voice/audio_io.py`, `Voice/gemini_script.py`, `Voice/turn_recorder.py`, `Voice/marcus_voice.py` | Sanad-pattern port: `AudioIO.from_profile("builtin", audio_client=ac)` builds the G1 mic + speaker; `GeminiBrain` runs Gemini Live `response_modalities=["TEXT"]` in a worker thread; `_dispatch_gemini_command` gates each transcript on the wake word "Sanad" + fuzzy match against `command_vocab`, then forwards to the brain. The brain's reply is spoken by `AudioAPI.speak()` via on-robot TtsMaker — Gemini never speaks. Earlier iterations (faster-whisper / wake_detector / Vosk / Moonshine / full S2S) all removed. Cloud dep: env `MARCUS_GEMINI_API_KEY`. |
 | Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. |
 | Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. |
 | Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. |
@@ -766,27 +766,31 @@ SAFETY:
 ---
 
-## 15. Voice API (mic + TTS + wake + STT)
+## 15. Voice API (mic + Gemini Live STT + TtsMaker)
 
-Current pipeline: G1 mic → custom energy wake detector → Whisper verify → TtsMaker "Yes" → record → faster-whisper transcribe → fuzzy-match canonical command → brain. Replaces all prior experiments (Gemini Live WebSocket, Vosk grammar, edge-tts / Piper).
+Current pipeline: G1 mic → Gemini Live (`response_modalities=["TEXT"]`) → input_transcription → wake-word gate + fuzzy match → brain → on-robot TtsMaker reply. Sanad-pattern port; the only cloud dependency is the Gemini API key. Replaces all prior local-STT attempts (Whisper / Moonshine / Vosk / wake_detector). The full Sanad-style speech-to-speech mode (Gemini speaks back) was tested and removed — TtsMaker as the single voice owner avoids the audio-collision class.
 
-### Mic — `Voice.builtin_mic.BuiltinMic`
-Captures the G1's on-board array microphone over UDP multicast. No USB mic required. 16 kHz mono int16 PCM natively; no resampling needed.
+### Mic + Speaker bundle — `Voice.audio_io.AudioIO`
+Sanad-pattern factory. `BuiltinMic` joins the G1's UDP multicast audio (16 kHz mono int16). `BuiltinSpeaker` wraps `AudioClient.PlayStream` with 24→16 kHz resampling (built but idle in STT-only mode; TtsMaker owns the speaker via a separate firmware API).
 
 ```python
-from Voice.builtin_mic import BuiltinMic
-mic = BuiltinMic(group="239.168.123.161", port=5555, buf_max=64_000)
-mic.start()
+from Voice.audio_io import AudioIO
+
+audio = AudioIO.from_profile("builtin", audio_client=ac)
+audio.start()
 try:
-    pcm = mic.read_chunk(1024)   # 512 samples, ~32 ms, int16 mono
-    # or
-    pcm = mic.read_seconds(3.0)
+    pcm = audio.mic.read_chunk(1024)   # 512 samples, ~32 ms
+    # or audio.mic.flush()
 finally:
-    mic.stop()
+    audio.stop()
 ```
 
-Config under `config_Voice.json::mic_udp`.
+Config under `config_Voice.json::{mic_udp, speaker}`.
+
+### Mic shim — `Voice.builtin_mic.BuiltinMic`
+Backward-compat shim. Subclasses `audio_io.BuiltinMic` and adds `read_seconds(s)` for `AudioAPI.record()`. Old imports of `from Voice.builtin_mic import BuiltinMic` keep working.
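As described, the shim is only a few lines. A sketch assuming `read_chunk(n)` takes a byte count, as the old BuiltinMic docs state (16 kHz mono int16, so 2 bytes per sample):

```python
from Voice.audio_io import BuiltinMic as _BaseMic

class BuiltinMic(_BaseMic):
    """Old import path, plus the blocking helper AudioAPI.record() expects."""

    def read_seconds(self, seconds, rate=16_000):
        # int16 mono: 2 bytes per sample
        return self.read_chunk(int(seconds * rate) * 2)
```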
### TTS — `Voice.builtin_tts.BuiltinTTS`

@@ -795,41 +799,62 @@ Wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker`. E
 ```python
 from Voice.builtin_tts import BuiltinTTS
 tts = BuiltinTTS(audio_client, default_speaker_id=0)
-tts.speak("Hello, I am Sanad", block=True)   # synth + play on G1 body speaker
+tts.speak("Hello, I am Sanad", block=True)
 ```
 
 Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly.
 
-### Wake detection — `Voice.wake_detector.WakeDetector`
-Pure-numpy energy state machine with adaptive noise floor. Classifies any 0.35-1.5 s speech burst as a candidate wake, captures the audio for post-hoc verification.
+### Gemini Live STT — `Voice.gemini_script.GeminiBrain`
+Direct port of Sanad's `gemini/script.py`, configured with `response_modalities=["TEXT"]` so Gemini transcribes but never speaks. Reconnect-safe: 660 s session timeout, exponential backoff cap 30 s, client recreated after 10 consecutive errors. Runs an asyncio loop inside a worker thread; sync `start()/stop()` wrappers.
 
 ```python
-from Voice.wake_detector import WakeDetector, WakeConfig
-cfg = WakeConfig(
-    sample_rate=16_000,
-    speech_threshold=400.0,      # min RMS floor — above noise
-    min_word_duration_s=0.35,    # filter out coughs (<0.35s)
-    max_word_duration_s=1.50,    # filter out sentences
-    post_silence_s=0.30,         # how long silence marks word end
-    cooldown_s=1.50,             # min gap between fires
-    chunk_ms=50,                 # RMS analysis window
-    adaptive_window_n=50,        # rolling mean of idle RMS
-    adaptive_mult=3.0,           # effective = max(floor, baseline×mult)
-)
-det = WakeDetector(cfg)
-while True:
-    pcm = mic.read_chunk(1024)
-    if det.process(pcm):
-        burst = det.get_last_burst()   # audio that triggered wake
-        break
+import os
+
+from Voice.audio_io import AudioIO
+from Voice.turn_recorder import TurnRecorder
+from Voice.gemini_script import GeminiBrain
+
+audio = AudioIO.from_profile("builtin", audio_client=ac)
+audio.start()
+rec = TurnRecorder(enabled=True, out_dir="Data/Voice/Recordings/gemini_turns")
+
+def on_transcript(text):
+    print("USER:", text)
+
+def on_command(text, lang):
+    print("dispatch:", text)
+
+brain = GeminiBrain(
+    audio, rec, voice_name="Charon",
+    system_prompt="...transcriber-role prompt...",
+    api_key=os.environ["MARCUS_GEMINI_API_KEY"],
+    on_transcript=on_transcript,
+    on_command=on_command,
+)
+brain.start()
+# ... later ...
+brain.stop()
+audio.stop()
 ```
 
-Config under `config_Voice.json::stt.{speech_threshold, min_word_duration, …}`.
+Config under `config_Voice.json::stt.gemini_*` — model, voice, VAD sensitivity, session lifecycle, persona, recording.
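The reconnect policy those knobs drive is a small supervision loop. A sketch under the documented values only (660 s cap, 30 s backoff ceiling, client recreated after 10 failures); `run_with_reconnect` and `session_factory` are illustrative names, not GeminiBrain's actual internals:

```python
import asyncio

async def run_with_reconnect(session_factory, *, session_timeout=660,
                             max_delay=30.0, max_errors=10):
    """Supervise a Gemini Live session per the knobs above (policy sketch)."""
    client_generation = 0            # bumping this forces a fresh client
    errors, delay = 0, 1.0
    while True:
        try:
            await asyncio.wait_for(session_factory(client_generation),
                                   timeout=session_timeout)
            errors, delay = 0, 1.0   # session ended cleanly: reconnect fresh
        except asyncio.TimeoutError:
            continue                 # hit Gemini Live's ~11 min session cap
        except Exception:
            errors += 1
            if errors >= max_errors:
                client_generation += 1   # recreate the client after 10 failures
                errors = 0
            await asyncio.sleep(delay)
            delay = min(delay * 2, max_delay)   # exponential backoff, cap 30 s
```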
+### Per-turn recorder — `Voice.turn_recorder.TurnRecorder`
+Saves `<ts>_user.wav` per turn plus an `index.json` entry with both transcripts. In STT-only mode, no `<ts>_robot.wav` is written (Gemini emits text, not audio).
+
+```python
+from Voice.turn_recorder import TurnRecorder
+rec = TurnRecorder(enabled=True, out_dir="Data/Voice/Recordings/gemini_turns",
+                   user_rate=16000, robot_rate=24000)
+rec.capture_user(pcm_bytes)
+rec.add_user_text("Sanad, turn right")
+rec.add_robot_text("Turning right")   # Gemini's text reply (recorded for review, not spoken)
+rec.finish_turn()   # → 20260425_120000_user.wav + index.json append
+```
 
 ### Voice orchestrator — `Voice.marcus_voice.VoiceModule`
 
-Drives the full pipeline: wake detector → Whisper verify → record → transcribe → fuzzy-match → dispatch. Three operating modes (`wake_and_command`, `always_on`, `always_on_gated`) selectable via `stt.mode`.
+Drives the full pipeline: builds AudioIO + TurnRecorder + GeminiBrain, gates each transcript on the wake word "Sanad", strips it, fuzzy-matches against `command_vocab`, dedups partial transcripts within `command_cooldown_sec`, then forwards the cleaned text to the user-supplied `on_command` callback.
 
 ```python
 from API.audio_api import AudioAPI
@@ -837,17 +862,20 @@ from Voice.marcus_voice import VoiceModule
 
 def on_command(text, lang):
     print(f"heard: {text}")
+    # return or call audio_api.speak(reply); flush_mic() is automatic in marcus_brain
 
 audio = AudioAPI()
 voice = VoiceModule(audio, on_command=on_command)
-voice.start()   # background thread
+voice.start()
 # ... later ...
 voice.stop()
 ```
 
-Vocabulary (`wake_words`, `command_vocab`, `garbage_patterns`) is loaded from `config_Voice.json::stt.*` at `VoiceModule.__init__`. All thresholds, Whisper params, and mode selection live in the same config — no Python edits required to tune. See `Doc/controlling.md` → "Voice" for the tuning-knobs cheat sheet.
+Vocabulary (`wake_words`, `command_vocab`, `garbage_patterns`) is loaded from `config_Voice.json::stt.*` at `VoiceModule.__init__`. All Gemini tunables (model, VAD, session timeouts, persona) live in the same config — no Python edits required. See `Doc/controlling.md` → "Voice" for the tuning-knobs cheat sheet.
 
-The brain's `_init_voice()` wires `on_command` to `process_command(text)` → `audio_api.speak(reply)`.
+`flush_mic()` is a public hook that `Brain/marcus_brain._on_command` calls before AND after `audio_api.speak(reply)` so TtsMaker output isn't transcribed back into Gemini as a fake user utterance.
+The brain's `_init_voice()` wires `on_command` to `process_command(text)` → `flush_mic()` → `audio_api.speak(reply)` → `flush_mic()`.
 
 ### AudioAPI — `API.audio_api.AudioAPI`
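The gate-strip-match-dedup chain the orchestrator runs is worth seeing end to end. A standalone sketch using `difflib` (the module the docs name); vocab lists are truncated and `dispatch` is an illustrative name for the `_dispatch_gemini_command` behavior, not its code:

```python
import difflib
import re
import time

WAKE_WORDS = ["sanad", "sannad", "sanat"]            # subset of stt.wake_words
COMMAND_VOCAB = ["turn left", "turn right", "stop"]  # subset of stt.command_vocab
_last = {"cmd": None, "t": 0.0}

def dispatch(transcript, on_command, cutoff=0.72, cooldown=1.5):
    text = transcript.lower()
    if not any(re.search(rf"\b{w}\b", text) for w in WAKE_WORDS):
        return                        # wake gate: ignore background chatter
    for w in WAKE_WORDS:              # strip the wake word + stray punctuation
        text = re.sub(rf"\b{w}\b[\s,.!?]*", " ", text)
    text = " ".join(text.split())
    hits = difflib.get_close_matches(text, COMMAND_VOCAB, n=1, cutoff=cutoff)
    if not hits:
        return                        # nothing close enough to a canonical command
    now = time.monotonic()
    if hits[0] == _last["cmd"] and now - _last["t"] < cooldown:
        return                        # dedup streaming partials
    _last.update(cmd=hits[0], t=now)
    on_command(hits[0], "en")         # forward the canonical form to the brain
```

Here `dispatch("Sanad, turn right up", on_command=print)` fires with `"turn right"`, matching the documented "Turn right up" → "turn right" canonicalization.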

View File

@@ -13,9 +13,9 @@
 - **Ollama compute-graph caps** — `num_batch=128`, `num_ctx=2048` in `config_Brain.json` (otherwise llama.cpp OOMs on the 16 GB Jetson).
 - **`num_predict_main: 120`** (was 200) — saves ~400-600 ms per open-ended command.
 - **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import.
-- **G1 built-in microphone** via UDP multicast `239.168.123.161:5555` — `Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic.
+- **G1 built-in microphone** via UDP multicast `239.168.123.161:5555` — defined in `Voice/audio_io.py::BuiltinMic` (Sanad-pattern port). `Voice/builtin_mic.py` is a thin backward-compat shim used by `API/audio_api.record()`.
 - **G1 built-in TTS** via `client.TtsMaker()` — `Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed.
-- **Voice stack finalised** — custom energy wake detector (`Voice/wake_detector.py`) + faster-whisper command STT (`Voice/marcus_voice.py`). Whisper verifies each acoustic wake before acking. Gemini voice module and Vosk grammar STT both tried and removed.
+- **Voice stack — Gemini Live STT + TtsMaker hybrid (subprocess split)** — `google-genai` requires Python ≥3.9 but the marcus env is pinned to Python 3.8 by the NVIDIA Jetson torch wheel, so the actual Gemini WebSocket runs in a **separate Python 3.10+ subprocess** (`Voice/gemini_runner.py`, executed under the `gemini_sdk` conda env). The marcus parent (Python 3.8) spawns it via `Voice/gemini_script.py::GeminiBrain` and parses JSON-line transcripts on stdout. `Voice/marcus_voice.py::_dispatch_gemini_command` gates each transcript on the wake word "Sanad" + fuzzy match against `stt.command_vocab`, then forwards to `Brain.marcus_brain.process_command(...)`. The brain's reply is spoken by the on-robot `TtsMaker` — Gemini never speaks. Same pattern Sanad uses (it parses log lines from a Gemini subprocess too). Earlier in-process attempts (faster-whisper / Vosk / Moonshine / Gemini Live in marcus 3.8 / full Gemini speech-to-speech) were all tried and removed.
 - **Subsystem flags** — `config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages.
 - **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps.
 - **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo clones cleanly on macOS/Windows.
@@ -66,7 +66,7 @@ Marcus/
 │   ├── config_Memory.json       # session/places paths
 │   ├── config_Network.json      # Jetson IPs (eth0/wlan0), ports
 │   ├── config_ImageSearch.json  # search defaults
-│   ├── config_Voice.json        # mic, TTS, wake detector thresholds, Whisper params, wake_words/command_vocab/garbage_patterns vocab lists, VAD thresholds
+│   ├── config_Voice.json        # mic, TTS, Gemini Live STT params (model, VAD sensitivities, session timeouts), wake_words/command_vocab/garbage_patterns vocab lists used by the dispatch gate
 │   ├── config_LiDAR.json        # Livox Mid-360 connection + SLAM engine params
 │   └── marcus_prompts.yaml      # All Qwen-VL prompts (main, goal, patrol, talk, verify, 2× imgsearch)
 │   # Total: 12 JSON files + 1 YAML. (config_Memory.json removed 2026-04-21.)
@@ -83,11 +83,14 @@ Marcus/
 │   ├── audio_api.py     # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic
 │   └── lidar_api.py     # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status()
-├── Voice/               # Mic + TTS + wake detector + faster-whisper STT
-│   ├── builtin_mic.py   # G1 array mic via UDP multicast 239.168.123.161:5555
-│   ├── builtin_tts.py   # BuiltinTTS — client.TtsMaker(text, speaker_id)
-│   ├── wake_detector.py # Pure-numpy energy wake detector (WakeDetector, WakeConfig) with adaptive baseline
-│   └── marcus_voice.py  # VoiceModule — orchestrates wake → verify → record → Whisper → dispatch
+├── Voice/               # Audio I/O + Gemini Live STT (subprocess) + TtsMaker glue
+│   ├── audio_io.py      # Mic/Speaker ABCs + BuiltinMic (UDP multicast) + BuiltinSpeaker (PlayStream) + AudioIO.from_profile (Sanad pattern)
+│   ├── builtin_mic.py   # Backward-compat shim — subclasses audio_io.BuiltinMic + adds read_seconds() for AudioAPI.record()
+│   ├── builtin_tts.py   # BuiltinTTS — client.TtsMaker(text, speaker_id) (used by AudioAPI.speak)
+│   ├── gemini_runner.py # Subprocess script (Python 3.10+, gemini_sdk env) — opens Gemini Live, owns mic + WAV recorder, emits JSON-line transcripts on stdout
+│   ├── gemini_script.py # GeminiBrain — subprocess MANAGER (Python 3.8). Spawns gemini_runner.py, reads stdout, fires on_transcript / on_command. Provides flush_mic() over stdin.
+│   ├── turn_recorder.py # TurnRecorder — used by the runner to save <ts>_user.wav + index.json
+│   └── marcus_voice.py  # VoiceModule — spawns GeminiBrain, runs the wake-word dispatch gate
 ├── Brain/               # Decision logic — imports ONLY from API/
 │   ├── marcus_brain.py  # Orchestrator: init_brain(), process_command(), run_terminal()
@@ -186,13 +189,14 @@ Marcus/
        │ wraps                    │ wraps
 ┌──────────────▼───────────┐  ┌────────▼────────────────┐
 │ Navigation / Vision      │  │ Voice                   │
-│  goal_nav.py             │  │  builtin_mic.py         │
-│  patrol.py               │  │  builtin_tts.py         │
-│  marcus_odometry.py      │  │  marcus_voice.py        │
-│  marcus_yolo.py          │  │  wake_detector.py       │
-│                          │  │  (Whisper + TtsMaker)   │
-│  marcus_imgsearch.py     │  └──────────┬──────────────┘
-└──────────────┬───────────┘             │
-       │
+│  goal_nav.py             │  │  audio_io.py            │
+│  patrol.py               │  │  gemini_script.py       │
+│  marcus_odometry.py      │  │  turn_recorder.py       │
+│  marcus_yolo.py          │  │  marcus_voice.py        │
+│                          │  │  builtin_tts.py         │
+│  marcus_imgsearch.py     │  │  (Gemini STT + TtsMaker)│
+└──────────────┬───────────┘  └──────────┬──────────────┘
        │                                 │
 ┌──────────────▼─────────────────────────▼────────────┐
 │                    Core Layer                        │
@@ -489,53 +493,55 @@ Supports text-only search (no reference image) using hint description.
 ### Voice/
 
-Mic, TTS, energy-based wake detector, and faster-whisper STT pipeline. All files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable).
+Audio I/O + Gemini Live STT + TtsMaker glue. All files run only when `config_Brain.json::subsystems.voice == true`. The voice path is the **single cloud dependency** in Marcus — Gemini Live transcribes the user's mic; everything else (TTS, brain, vision, motion) stays on the Jetson. TTS is English-only by design (the G1 firmware silently maps non-English to Chinese).
+
+The Voice/ layout mirrors `Project/Sanad/voice/` (Mic/Speaker/AudioIO factory + TurnRecorder + GeminiBrain) — class names and method signatures match Sanad verbatim. Only the brain configuration differs: Marcus uses `response_modalities=["TEXT"]` (STT-only) while Sanad uses `["AUDIO"]` (full speech-to-speech).
 
-#### `builtin_mic.py` (~180 lines)
-Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer. Identical pattern to `Project/Sanad/voice/audio_io.py::BuiltinMic`.
-**Exports:**
-- `BuiltinMic(group, port, buf_max, read_timeout)` — init (idempotent)
-- `start()` / `stop()` — socket lifecycle
-- `read_chunk(n)` — pull exactly `n` bytes (blocks up to `read_timeout`, pads silence otherwise)
-- `read_seconds(s)` — convenience for "record `s` seconds"
-- `flush()` — drop buffered audio (called while TTS plays, to avoid echo)
+#### `audio_io.py` (~345 lines)
+Sanad-pattern hardware abstraction. Defines `Mic` and `Speaker` ABCs, the G1-specific `BuiltinMic` (UDP multicast subscriber, `239.168.123.161:5555`, 32 ms chunks, thread-safe ring buffer), `BuiltinSpeaker` (streaming wrapper around `AudioClient.PlayStream` with 24→16 kHz resample), and the `AudioIO.from_profile("builtin", audio_client=ac)` factory. `BuiltinSpeaker` is built in STT-only mode but never driven — TtsMaker owns the speaker via a separate G1 firmware API.
+**Exports:** `Mic`, `Speaker`, `BuiltinMic`, `BuiltinSpeaker`, `AudioIO`, `_resample_int16`, `_as_int16_array`.
 
-#### `builtin_tts.py` (~70 lines)
-Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input.
-**Exports:**
-- `BuiltinTTS(audio_client, default_speaker_id=0)` — init
-- `speak(text, speaker_id=None, block=True)` — synth+play on G1 body speaker
+#### `builtin_mic.py` (~58 lines)
+Backward-compat shim. Subclasses `audio_io.BuiltinMic` and adds `read_seconds(s)` for `API/audio_api.record()`. Old imports of `from Voice.builtin_mic import BuiltinMic` keep working. New code should import `audio_io.BuiltinMic` directly.
 
-#### `wake_detector.py` (~240 lines)
-Pure-numpy energy-envelope state machine. Fires a wake event when it sees a short speech burst (0.2-1.5 s) sized to match a single spoken word like "Sanad", followed by a clear silence. No ML, no lexicon — just amplitude classification.
-Adaptive noise-floor baseline: learns ambient RMS during idle, raises the effective threshold proportionally, so the detector works the same in a quiet room and a noisy lab. Captures the triggering burst audio (`get_last_burst()`) so callers can verify it was actually "Sanad" before acking. Exists because Vosk/Whisper both failed on the G1 far-field mic for short non-English proper nouns.
-**Exports:**
-- `WakeDetector(cfg)` with `WakeConfig(sample_rate, speech_threshold, min_word_duration_s, max_word_duration_s, post_silence_s, cooldown_s, chunk_ms, adaptive_window_n, adaptive_mult, diag_log_sec)`
-- `process(pcm_bytes) -> bool` — feed audio, returns True once per spoken "word"
-- `reset()`, `get_last_burst() -> np.ndarray | None`
+#### `builtin_tts.py` (~120 lines)
+Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Used by `API/audio_api.speak()` to render the brain's spoken replies. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input.
+**Exports:** `BuiltinTTS(audio_client, default_speaker_id=0)`, `.speak(text, speaker_id=None, block=True)`.
 
-#### `marcus_voice.py` (~1000 lines)
-Voice orchestrator. Reads from `BuiltinMic`, runs the `WakeDetector`, verifies the wake burst with a lightweight Whisper decode, records the command with hysteretic VAD (speech_entry / silence_exit thresholds, adaptive to measured ambient), trims leading silence before Whisper, transcribes with faster-whisper, fuzzy-matches against `command_vocab` to canonicalize near-misses ("Turn right up" → "turn right"), then dispatches to the brain callback.
-Three operating modes selectable via `stt.mode`:
-- `wake_and_command` (default): classic acoustic wake → TTS "Yes" → record → Whisper → brain
-- `always_on`: no wake, transcribe every utterance, dispatch all
-- `always_on_gated`: transcribe everything, only dispatch utterances containing "Sanad"
-Wake verify rule: Whisper's decode must either contain a wake-word variant (`stt.wake_words`) OR start with `s/sh/z` — Whisper's consistent signature for mishearing "Sanad" as "Stop"/"Set"/"Sand". Pure silence / non-s speech is rejected silently.
-**Module-level** (populated at `VoiceModule.__init__` from config):
-- `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` — loaded from `config_Voice.json::stt.*`, single source of truth
-- `_has_wake_word(text)`, `_strip_wake_word(text)` — iterative until stable, handles "Sanad. Sanad." → ""
+#### `gemini_script.py` (~458 lines)
+The STT brain. `GeminiBrain` opens a Gemini Live session over WebSocket (`google-genai` SDK) configured with `response_modalities=["TEXT"]` and `input_audio_transcription`. A `_send_mic_loop` coroutine streams 512-sample int16 PCM blobs at 16 kHz; a `_receive_loop` coroutine extracts `server_content.input_transcription.text` and fires `on_transcript` + `on_command` callbacks. No audio comes back — Gemini's text reply is logged but never played.
+Reconnect-safe: 660 s session timeout, exponential backoff (cap 30 s), client recreated after 10 consecutive errors, 30 s no-message dead-session detector. All values match Sanad's `voice_config.json::sanad_voice`.
+`start()/stop()` are synchronous wrappers that run `async run()` inside a worker thread's asyncio loop — Marcus's `VoiceModule` is threaded, so this adapter is the only Marcus-specific addition vs Sanad's structure.
+**Exports:** `GeminiBrain(audio_io, recorder, voice_name, system_prompt, *, api_key, on_transcript, on_command)` + `start()/stop()`.
+
+#### `turn_recorder.py` (~158 lines)
+Per-turn WAV saver. `capture_user(pcm)` and `add_user_text(text)` buffer in RAM until `finish_turn()` flushes one `<ts>_user.wav` (16 kHz int16 mono) plus an `index.json` entry per turn with `user_text` + `robot_text` (Gemini's text reply, kept for review even though never spoken). In STT-only mode, `<ts>_robot.wav` is **not** written — there is no PCM coming back from Gemini to capture; the actual robot voice is generated on demand by TtsMaker and never flows through this recorder.
+**Exports:** `TurnRecorder(enabled, out_dir, user_rate, robot_rate)` + `capture_user`, `capture_robot`, `add_user_text`, `add_robot_text`, `finish_turn`.
+
+#### `marcus_voice.py` (~450 lines)
+Voice orchestrator. `VoiceModule.__init__` loads `WAKE_WORDS / COMMAND_VOCAB / GARBAGE_PATTERNS` from `config_Voice.json::stt.*`. `_voice_loop_gemini` builds `AudioIO.from_profile("builtin", audio_client=ac)`, instantiates `TurnRecorder`, then constructs and starts a `GeminiBrain` with two callbacks:
+- `on_transcript(text)` → writes a `HEARD ...` line to `logs/transcript.log`.
+- `on_command(text, "en")` → `_dispatch_gemini_command`: gates on `_has_wake_word(text)` (must contain "Sanad" or a fuzzy variant), strips the wake word, fuzzy-matches against `command_vocab` for canonicalization (e.g. "Turn right up" → "turn right"), dedups partial transcripts within `command_cooldown_sec`, then forwards the cleaned text to `Brain.marcus_brain.process_command(...)` via the user's `on_command` callback.
+`flush_mic()` drops any buffered mic audio — called by `Brain/marcus_brain._on_command` before AND after `_audio_api.speak(reply)` so TtsMaker output isn't transcribed back into Gemini as a fake user utterance.
+**Module-level** (populated at `__init__` from config):
+- `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` — single source of truth
+- `_has_wake_word(text)`, `_strip_wake_word(text)` — iterative; handles "Sanad. Sanad." → ""
 - `_closest_command(text, cutoff)` — difflib fuzzy-match against `COMMAND_VOCAB`
 
 **Exports:**
 - `VoiceModule(audio_api, on_command=cb, on_wake=None)` — init
 - `start()` / `stop()` — background thread lifecycle
-- `is_running` property
+- `flush_mic()` — public hook for echo prevention around speak()
+- `is_speaking` property — delegates to `AudioAPI.is_speaking`
 
 ---
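The multicast capture `audio_io.BuiltinMic` is built on is plain socket code. A minimal sketch with the group/port from `config_Voice.json::mic_udp`; the socket options shown are standard multicast boilerplate, not lifted from `audio_io.py`:

```python
import socket
import struct

GROUP, PORT = "239.168.123.161", 5555     # config_Voice.json::mic_udp

sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM, socket.IPPROTO_UDP)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.bind(("", PORT))
mreq = struct.pack("4sl", socket.inet_aton(GROUP), socket.INADDR_ANY)
sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
sock.settimeout(0.04)                     # mic_udp.read_timeout_sec

try:
    pcm, _addr = sock.recvfrom(4096)      # int16 mono 16 kHz PCM payload
    print(f"got {len(pcm)} bytes ({len(pcm) // 2} samples)")
except socket.timeout:
    print("no audio published this tick")
```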

View File

@@ -79,29 +79,30 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
 ## Voice
 
-- **Wake word:** "Sanad" (Whisper mishears it as "Stop", "Sand", "Set", "Send" — all accepted via the /s-/ phonetic rule; see `config_Voice.json::stt.wake_words` for the 33 fuzzy variants).
+- **Wake word:** "Sanad" — gated at dispatch time on Gemini's transcript. Common mishearings ("Sannad", "Senad", "Sa nad", etc.) are all accepted via the 33-entry `config_Voice.json::stt.wake_words` fuzzy list. Word-boundary match, not substring (so "standard" doesn't trigger off "sand").
-- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed.
+- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic, no acoustic wake detector.
-- **Wake detection:** custom energy-envelope state machine (pure numpy, no ML) — fires on any 0.35-1.5 s speech burst followed by silence. Adaptive to room ambient.
+- **STT:** Gemini Live (`gemini-2.5-flash-native-audio-preview-12-2025`) with `response_modalities=["TEXT"]` — Gemini does the transcription. The mic is streamed in 32 ms chunks; Gemini's server-side VAD decides turn boundaries. **The Gemini WebSocket runs in a separate Python 3.10+ subprocess** (`Voice/gemini_runner.py`) because `google-genai` doesn't support Python 3.8 (which marcus is pinned to). Marcus spawns the runner via the `gemini_sdk` conda env and reads JSON-line transcripts off its stdout. Requires `pip install google-genai` **inside the gemini_sdk env** (not the marcus env) and an API key in `MARCUS_GEMINI_API_KEY` (or `SANAD_GEMINI_API_KEY` fallback). Set `MARCUS_GEMINI_PYTHON` (or `stt.gemini_python_path`) if the gemini_sdk env lives somewhere besides `~/miniconda3/envs/gemini_sdk/`.
-- **Wake verify:** lightweight Whisper decode on the triggering burst. Accepts if it contains a wake-word variant OR starts with `s`/`sh`/`z` (Whisper's consistent signature for "Sanad"). Rejects pure noise / non-s speech silently.
+- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only. Gemini does NOT speak — only Marcus's brain reply is spoken, via TtsMaker.
-- **STT (command):** faster-whisper `base.en` int8 on CPU — loads ~1.5 s on first wake, cached after.
+- **Echo prevention:** `VoiceModule.flush_mic()` is called by Marcus's brain before AND after `audio_api.speak()` so TtsMaker output isn't transcribed back into Gemini as a fake user utterance.
-- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only.
-- **Barge-in:** the mic is muted during TTS playback, then flushed on return to listening.
 
-Interaction flow: say "Sanad" → hear *"Yes"* → speak your command → see transcript on console → Marcus answers through the speaker.
+Interaction flow: speak "Sanad" + your request → Gemini transcribes (Marcus prints `USER: ...`) → wake-word gate passes → brain handles it (motion, VLM Q&A, place memory, …) → reply spoken through G1 speaker.
 
-Three voice modes selectable via `config_Voice.json::stt.mode`:
-- `wake_and_command` (default) — wake word required before each command
-- `always_on` — continuously transcribe + dispatch every utterance
-- `always_on_gated` — always listen + log, dispatch only if utterance contains "Sanad"
+Examples:
+- "Sanad, turn right" → robot turns right, brain says "Done"
+- "Sanad, what do you see" → Qwen2.5-VL describes the camera frame, brain speaks the description
+- "Sanad" alone (no payload) → no dispatch (the persona prompt tells Gemini to acknowledge silently)
+- "what do you see" (no "Sanad") → wake-word gate blocks, no dispatch, no reply (avoids false motion from background chatter)
 
-To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only ~2 s faster.
+To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only without opening the Gemini WebSocket.
 
-**Tuning knobs** (when false wakes or rejected real wakes) — all in `config_Voice.json::stt`:
-- Too many false wakes from coughs/claps → raise `speech_threshold` or `min_word_duration`
-- Real "Sanad" being rejected → check the log line `wake REJECTED — %r` to see what Whisper heard; widen `wake_words` if needed
-- Commands transcribed wrong → check `whisper: lp=%.2f nsp=%.2f text=%r` log line; lower `whisper_no_speech_threshold` or tighten `whisper_log_prob_threshold`
-- "I didn't catch that" on silence → raise `min_transcription_length`
-- Latency too high → set `wake_ack: "none"` (skip "Yes" TTS, save ~1.7 s/cycle)
+**Tuning knobs** — all in `config_Voice.json::stt`:
+- Real "Sanad" misheard by Gemini and not matching wake_words → check `logs/transcript.log` for the `HEARD` line, add the variant to `wake_words`
+- Commands transcribed wrong → field accuracy is mostly Gemini's job; for room-specific tuning try `gemini_vad_silence_duration_ms` (longer = more patience for hesitations)
+- VAD too eager / too slow → `gemini_vad_start_sensitivity` (`HIGH` / `LOW`) and `gemini_vad_end_sensitivity` (`LOW` for slow speech, `HIGH` to cut early)
+- Filler words triggering dispatch → expand `garbage_patterns`
+- Robot too talkative / too terse → edit `gemini_system_prompt` (or point `gemini_system_prompt_file` at a `.txt` for richer personas)
+- Session reconnects too aggressive → raise `gemini_max_consecutive_errors`
+- Disable per-turn WAV saves → `gemini_record_enabled: false`
 
 ---
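The parent/runner protocol described above (JSON lines on stdout, commands on stdin) can be sketched in a few lines. Assumptions are flagged inline: the runner path comes from `stt.gemini_python_path` auto-detection, and the message schema shown is illustrative, not copied from `gemini_runner.py`:

```python
import json
import subprocess

# Spawn the runner under the gemini_sdk env; "python3" here stands in for the
# detected gemini_python_path (e.g. ~/miniconda3/envs/gemini_sdk/bin/python).
proc = subprocess.Popen(
    ["python3", "-u", "Voice/gemini_runner.py"],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE, text=True, bufsize=1,
)

for line in proc.stdout:                  # one JSON object per line
    try:
        msg = json.loads(line)
    except ValueError:
        continue                          # tolerate stray log noise on stdout
    if msg.get("type") == "transcript":   # field names are an assumption
        print("USER:", msg.get("text", ""))
        # hand off to the wake-word gate / on_command from here

# flush_mic() travels the other way, over the runner's stdin, e.g.:
# proc.stdin.write(json.dumps({"cmd": "flush_mic"}) + "\n"); proc.stdin.flush()
```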
@@ -209,7 +210,7 @@ Control what initializes at boot. Defaults:
 ```
 
 Set any to `false` to skip that subsystem's init. Boot time drops roughly:
-- `voice: false` → ~2 s faster (no Whisper model load)
+- `voice: false` → ~1 s faster (no Gemini WebSocket open, no mic thread)
 - `lidar: false` → ~1 s faster (no SLAM subprocess spawn)
 - `imgsearch: false` → already the default; re-enable only when you need `search/ …`
 - `autonomous: false` → minor, but removes the AutonomousMode init
@ -244,7 +245,10 @@ Most values configurable in `Config/config_Network.json` and `config_Voice.json:
| `llama runner process has terminated: %!w(<nil>)` | Ollama compute graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` |
| Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only |
| `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10–15 s on first Qwen load; subsequent commands are fast |
| Wake word never fires | Gemini transcribed but `_has_wake_word` rejected | Check `logs/transcript.log` — if `HEARD ...` shows what Gemini heard but no `CMD ...` follows, the transcript has a misheard "Sanad" variant; add the root form to `config_Voice.json::stt.wake_words`. |
| Voice silent on boot | Missing Gemini API key | Check `logs/voice.log` for `No Gemini API key found`. Set `export MARCUS_GEMINI_API_KEY='...'` before launching `run_marcus.py`. |
| `google-genai not installed` in runner stderr | Package missing in gemini_sdk env | Activate the gemini_sdk conda env and `pip install google-genai` THERE (not in marcus). |
| `no Python 3.10+ env found for the Gemini runner` | gemini_sdk env in non-default path | Set `export MARCUS_GEMINI_PYTHON=/path/to/gemini_sdk/bin/python` or edit `stt.gemini_python_path`. |
| Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" |
| `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If persists, `ping 192.168.123.120` |
| Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up |
| Component | Path |
|------|------|
| Brain code | `~/Marcus/Brain/` |
| Server | `~/Marcus/Server/marcus_server.py` |
| Voice | `~/Marcus/Voice/{audio_io,builtin_mic,builtin_tts,gemini_script,turn_recorder,marcus_voice}.py` |
| Config | `~/Marcus/Config/` |
| Prompts | `~/Marcus/Config/marcus_prompts.yaml` |
| YOLO model | `~/Marcus/Models/yolov8m.pt` |


All 25 project modules import cleanly from the `marcus` env:
```
OK  Core.config_loader   Core.env_loader
OK  Core.log_backend     Core.logger
OK  Voice.audio_io       Voice.builtin_mic      Voice.builtin_tts
OK  Voice.gemini_script  Voice.turn_recorder    Voice.marcus_voice
OK  Vision.marcus_yolo   Vision.marcus_imgsearch
OK  API.llava_api        API.yolo_api           API.camera_api
OK  API.zmq_api          API.imgsearch_api      API.odometry_api
```
---

| Date | Change |
|---|---|
| 2026-04-21 | **Subprocess leak fix**: `AudioAPI._record_parec` now wraps `Popen` in try/finally with `terminate → wait(1.0) → kill` fallback; orphan `parec` processes can no longer survive Ctrl-C. Last-resort `proc.kill()` catches only `OSError` (not bare `except`). |
| 2026-04-21 | **Modelfile corrected**: `Models/Modelfile` now `FROM qwen2.5vl:3b` (was `:7b`) with a header explaining it's an optional build template — runtime uses `ollama pull qwen2.5vl:3b` directly. |
| 2026-04-21 | **Final verification**: 14-dimension smoke test green — no Arabic, no dead dirs, 0 orphan keys, every FileHandler rotates, no bare `except: pass`, no stale `Models_marcus` / `marcus_llava` refs, 25/25 modules import. |
| 2026-04-24 | **Voice finalised on faster-whisper + custom energy wake** (later replaced — see 2026-04-25). Added `Voice/wake_detector.py` + rewrote `Voice/marcus_voice.py` around it with three modes (`wake_and_command` / `always_on` / `always_on_gated`), hysteretic record VAD, faster-whisper `base.en` int8 CPU decode. Field testing on the G1 far-field mic showed unacceptable transcription error rates regardless of tuning. |
| 2026-04-24 | **Command parser widened**: `Brain/command_parser.py` now has `_RE_SIMPLE_DIR` (`left`, `go back`, `move forward`, `step right`, etc.) and `_RE_STOP_SIMPLE` (`stop`, `halt`, `wait`, `pause`, `freeze`) regex fast-paths — these bare-direction / bare-stop commands now skip Qwen entirely (~50 ms vs ~5 s). Motion velocities and step duration pulled from `config_Navigation.json::{move_map, step_duration_sec}` via `API/zmq_api.py`; command_parser no longer contains hardcoded `0.3` / `2.0` magic numbers. |
| 2026-04-25 | **Voice rewritten on Gemini Live (Sanad-pattern port)**. Replaced the wake_detector + faster-whisper + (briefly attempted) Moonshine paths with Gemini Live STT-only (`response_modalities=["TEXT"]`). New files mirror Sanad's `voice/audio_io.py` + `voice/sanad_voice.py::TurnRecorder` + `gemini/script.py` structure: **`Voice/audio_io.py`** (Mic/Speaker ABCs + BuiltinMic/BuiltinSpeaker + AudioIO.from_profile factory), **`Voice/turn_recorder.py`** (per-turn WAV saver), **`Voice/gemini_script.py`** (`GeminiBrain` STT-only, threaded asyncio adapter around Sanad's async `run()`). `Voice/builtin_mic.py` becomes a backward-compat shim. `Voice/wake_detector.py` deleted. `Voice/marcus_voice.py` shrinks 1578 L → 438 L: `_voice_loop_gemini` builds AudioIO, spawns GeminiBrain, dispatch gate (`_dispatch_gemini_command`) requires "Sanad" + fuzzy-match `command_vocab` then forwards to brain. Added `flush_mic()` hook called by `Brain/marcus_brain._on_command` around `audio_api.speak()` to prevent TtsMaker echo from being transcribed. `Config/config_Voice.json` rewritten: dropped `stt.backend`, `stt.mode`, all `whisper_*` / `moonshine_*` / wake-detector / VAD-record knobs, all barge-in keys (no Gemini audio-out anymore); kept Sanad-matching values for `mic_udp`, `speaker`, Gemini VAD/session/voice settings. Single cloud dependency: env `MARCUS_GEMINI_API_KEY` (or `SANAD_GEMINI_API_KEY` fallback) + `pip install google-genai` on Jetson. Mechanism / tunables / interruption identical to Sanad; only difference is response modality (TEXT vs AUDIO) and command-handling gate (wake-word-required vs Sanad's `trigger_enabled` master flag). |


---
## `Voice/` — audio I/O + Gemini Live STT + TtsMaker

| File | Public API |
|---|---|
| `audio_io.py` | `_find_g1_local_ip()`, `_resample_int16`, `_as_int16_array`, abstract **classes `Mic`, `Speaker`**, concrete **classes `BuiltinMic`, `BuiltinSpeaker`**, **dataclass `AudioIO`** with `from_profile()` factory |
| `builtin_mic.py` | **class `BuiltinMic`** (subclass of `audio_io.BuiltinMic` + `read_seconds()` for `AudioAPI.record()`) |
| `builtin_tts.py` | **class `BuiltinTTS`** (used by `AudioAPI.speak()`) |
| `gemini_script.py` | module-level `_load_voice_cfg()`, `_audio_energy()`, **class `GeminiBrain`** |
| `turn_recorder.py` | **class `TurnRecorder`** |
| `marcus_voice.py` | module-level `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` (populated from config), helpers `_has_wake_word`, `_strip_wake_word`, `_strip_wake_word_once`, `_closest_command`, **class `VoiceModule`** |
**`Voice.audio_io.BuiltinMic`** — G1 UDP multicast mic (Sanad-pattern port):
`__init__(group, port, buf_max)`, `start()`, `stop()`, `read_chunk(num_bytes)`, `flush()`; internal `_recv_loop`.
**`Voice.audio_io.BuiltinSpeaker`** — streaming wrapper over `AudioClient.PlayStream` (built but idle in STT-only mode; TtsMaker owns the speaker):
`__init__(audio_client, app_name=None)`, `begin_stream()`, `send_chunk(pcm, source_rate)`, `wait_finish()`, `stop()`, properties `interrupted`, `total_sent_sec`. Internal `_stop_play_api()`.
**`Voice.audio_io.AudioIO`** — paired mic + speaker bundle:
`@classmethod from_profile(profile_id, *, audio_client=None) -> AudioIO`, `start()`, `stop()`. Only `"builtin"` profile supported (Anker/Hollyland USB profiles dropped).
**`Voice.builtin_tts.BuiltinTTS`** — wraps `AudioClient.TtsMaker`:
`__init__(audio_client, default_speaker_id=0)`, `speak(text, speaker_id=None, block=True)`.

**`Voice.gemini_script.GeminiBrain`** — Gemini Live STT-only brain (Sanad `gemini/script.py` port):
`__init__(audio_io, recorder, voice_name=None, system_prompt="", *, api_key, on_transcript=None, on_command=None)`, `start()`, `stop()`, `async run()`. Internal: `_thread_main()` runs an asyncio loop in a worker thread, `_build_config(types)` returns `LiveConnectConfig(response_modalities=["TEXT"], input_audio_transcription, system_instruction)`, `_send_mic_loop(session, types)` streams 32 ms PCM chunks, `_receive_loop(session)` extracts `input_transcription.text` → callbacks + `model_turn` text → log + recorder.

**`Voice.turn_recorder.TurnRecorder`** — per-turn WAV saver (a minimal write sketch follows below):
`__init__(enabled, out_dir, user_rate, robot_rate)`, `capture_user(pcm_bytes)`, `capture_robot(pcm_bytes)`, `add_user_text(text)`, `add_robot_text(text)`, `finish_turn() -> dict`. Internal: `_save_wav`, `_append_index`. In STT-only mode `<ts>_robot.wav` is never written (Gemini emits text, not audio).
**`Voice.marcus_voice.VoiceModule`** — voice orchestrator. Builds `AudioIO.from_profile("builtin", audio_client=ac)`, spawns `GeminiBrain` with `_on_gemini_transcript` (transcript log) and `_dispatch_gemini_command` (wake-word gate + fuzzy match → on_command callback) hooks. Forwards every "Sanad + X" transcript to Marcus's brain via the user-supplied `on_command` callback.
`__init__(audio_api, on_command=None, on_wake=None)`, `start()`, `stop()`, `flush_mic()`, `is_speaking` property. Internal: `_voice_loop` (calls `_voice_loop_gemini`), `_voice_loop_gemini` (assembles AudioIO + TurnRecorder + GeminiBrain), `_on_gemini_transcript(text)`, `_dispatch_gemini_command(text, lang)`, `_normalize_command(text)`. The `flush_mic()` hook is called by `Brain/marcus_brain._on_command` before AND after `audio_api.speak()` to prevent TtsMaker output from being transcribed back as user input.
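The per-turn WAV write itself is plain stdlib `wave`. A minimal sketch of what a `_save_wav`-style helper has to do for the 16 kHz mono int16 user stream (function name and filename pattern are illustrative, not the exact `turn_recorder.py` internals):

```python
import os
import time
import wave

def save_turn_wav(pcm: bytes, out_dir: str, sample_rate: int = 16_000) -> str:
    """Write one turn's int16 mono PCM as <timestamp>_user.wav and return the path."""
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, "{}_user.wav".format(int(time.time() * 1000)))
    with wave.open(path, "wb") as wf:
        wf.setnchannels(1)        # mono
        wf.setsampwidth(2)        # int16 = 2 bytes/sample
        wf.setframerate(sample_rate)
        wf.writeframes(pcm)
    return path
```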
---
from API.lidar_api import init_lidar, obstacle_ahead, get_slam_pose, stop_lidar
from API.memory_api import init_memory, log_cmd, log_detection, place_save, place_goto

# voice pipeline
from Voice.marcus_voice import VoiceModule
from Voice.audio_io import AudioIO, BuiltinMic, BuiltinSpeaker
from Voice.builtin_tts import BuiltinTTS   # used by AudioAPI.speak()
from Voice.gemini_script import GeminiBrain
from Voice.turn_recorder import TurnRecorder

# navigation
from Navigation.goal_nav import navigate_to_goal
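Putting the voice imports together, a hypothetical wiring sketch; `audio_api` is a placeholder for the initialised `API/audio_api.py` object built at boot:

```python
from Voice.marcus_voice import VoiceModule

def on_command(text: str, lang: str) -> None:
    # The real callback (built inside init_voice) forwards to the brain.
    print("dispatch:", text, lang)

audio_api = ...  # placeholder: the initialised AudioAPI instance
voice = VoiceModule(audio_api, on_command=on_command)
voice.start()    # spawns AudioIO + TurnRecorder + GeminiBrain per _voice_loop_gemini
```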


## Voice pipeline (when `subsystems.voice = true`)
Marcus uses **Gemini Live STT-only** for the user's mic plus **G1 TtsMaker** for the brain's spoken reply. No local wake detector — Gemini's server-side VAD decides turn boundaries; the wake-word check happens at dispatch time on the transcribed text.
```
G1 body mic (array)
 └─ UDP multicast 239.168.123.161:5555 ── int16 mono 16 kHz PCM
        ▼
Voice/audio_io.py::BuiltinMic
    ring buffer (64 KB) + read_chunk(n)   (Sanad-pattern; see audio_io.py)
        ▼
Voice/gemini_script.py::GeminiBrain  (asyncio worker thread)
    ├─ client.aio.live.connect(model="gemini-2.5-flash-native-audio-preview-12-2025",
    │      config=LiveConnectConfig(
    │          response_modalities=["TEXT"],           ← STT-only
    │          input_audio_transcription={},
    │          realtime_input_config=AutomaticActivityDetection(
    │              start_of_speech_sensitivity=HIGH,
    │              end_of_speech_sensitivity=LOW,
    │              prefix_padding_ms=20,
    │              silence_duration_ms=200),
    │          system_instruction=<transcriber-only role>))
    ├─ _send_mic_loop → 512-sample PCM chunks (32 ms each) → session.send_realtime_input
    ├─ _receive_loop → server_content.input_transcription.text → on_transcript + on_command
    └─ on turn_complete → recorder.finish_turn() → "listening" log
        ▼
Voice/marcus_voice.py::VoiceModule._dispatch_gemini_command(text, "en")
    ├─ 1. _has_wake_word(text)
    │        match any of stt.wake_words variants as a whole word — else return early
    ├─ 2. _strip_wake_word(text)
    │        iterative until stable, "Sanad. Sanad." → "" / "Sanad turn right" → "turn right"
    ├─ 3. garbage / min-length filter
    │        skip "okay"/"thanks"/single-letter unless command_vocab matches exactly
    ├─ 4. _normalize_command(stripped)
    │        difflib fuzzy-match vs stt.command_vocab
    │        "Turn right up" → "turn right" (canonical form)
    ├─ 5. dedup vs last_gemini_canon within command_cooldown_sec
    └─ 6. on_command(text, "en")
        ▼
Brain/marcus_brain.py::_on_command  (closure inside init_voice)
    ├─ flush_mic()                       ← drop pending mic audio
    ├─ result = process_command(text)
    │   ├─ regex fast-path → Brain/command_parser.py::try_local_command()
    │   │      places · odometry walk/turn · patrol · session recall · goal_nav
    │   │      + SIMPLE_DIR ("go back", "right", "forward") · STOP_SIMPLE ("stop", "halt")
    │   │      + NAT_GOAL_RE (naturalised goals like "the chair") · auto on/off
    │   │      (~50 ms when matched — NO LLM call)
    │   ├─ _TALK_PATTERNS ("what / who / where / …") → _handle_talk(cmd)
    │   │      → API/llava_api.py::ask_talk(...) → Qwen2.5-VL
    │   └─ else → _handle_llava(text)
    │       ├─ get_frame() (10×50 ms poll, no 1 s stall)
    │       ├─ API/llava_api.py::ask(text, img)
    │       │      ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120)
    │       │      → parse_json() → {actions, arm, speak, abort}
    │       └─ Brain/executor.py::execute(d)
    │           ├─ actions → MOVE_MAP[dir] → API/zmq_api.py::send_vel → Holosoma
    │           ├─ arm → API/arm_api.py (stub for now)
    │           └─ abort → gradual_stop()
    ├─ audio_api.speak(result["speak"])  ← TtsMaker via G1 firmware
    └─ flush_mic()                       ← drop the speaker's echo from mic buffer
        ▼
API/audio_api.py::speak(text, lang="en")
    ├─ Voice/builtin_tts.py::BuiltinTTS.speak(text)
    │      client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only
    │      time.sleep(len(text) * 0.08)
    └─ → back to listening
```
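The dispatch gate is plain string processing; no ML runs after Gemini's transcript. Below is a minimal sketch of the three text steps (whole-word wake match, iterative strip, difflib normalisation). The vocab lists are illustrative stand-ins for the 33-entry `wake_words` and 68-entry `command_vocab` config values, and the helper bodies are a sketch, not the shipped implementations:

```python
import difflib
import re

WAKE_WORDS = ["sanad", "sand", "sana"]               # illustrative subset of stt.wake_words
COMMAND_VOCAB = ["turn right", "turn left", "stop"]  # illustrative subset of stt.command_vocab
CUTOFF = 0.75                                        # stands in for stt.command_vocab_cutoff

def _has_wake_word(text: str) -> bool:
    low = text.lower()
    return any(re.search(r"\b" + re.escape(w) + r"\b", low) for w in WAKE_WORDS)

def _strip_wake_word(text: str) -> str:
    out = text
    while True:  # iterate until stable: "Sanad. Sanad turn right" -> "turn right"
        new = out
        for w in WAKE_WORDS:
            new = re.sub(r"(?i)\b" + re.escape(w) + r"\b[.,!?]*\s*", "", new, count=1)
        if new == out:
            return new.strip(" .,!?")
        out = new

def _normalize_command(text: str) -> str:
    hits = difflib.get_close_matches(text.lower(), COMMAND_VOCAB, n=1, cutoff=CUTOFF)
    return hits[0] if hits else text

assert _has_wake_word("Sanad, turn right up")
assert _normalize_command(_strip_wake_word("Sanad, turn right up")) == "turn right"
```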
**Config knobs** (all in `config_Voice.json::stt`):
- Gemini connection: `gemini_model`, `gemini_voice_name`, `gemini_audio_profile`, `gemini_chunk_size`, `gemini_send_sample_rate`
- Gemini VAD: `gemini_vad_start_sensitivity`, `gemini_vad_end_sensitivity`, `gemini_vad_prefix_padding_ms`, `gemini_vad_silence_duration_ms`
- Gemini session lifecycle: `gemini_session_timeout_sec`, `gemini_max_reconnect_delay_sec`, `gemini_max_consecutive_errors`, `gemini_no_messages_timeout_sec`
- Persona: `gemini_system_prompt` (inline) or `gemini_system_prompt_file` (path)
- Recording (debug WAVs): `gemini_record_enabled`
- Mic gain: `mic_gain`
- Dispatch: `wake_words` (gate), `command_vocab` (fuzzy-match target), `garbage_patterns`, `command_vocab_cutoff`, `min_transcription_length`, `command_cooldown_sec`
- Hardware: `mic_udp.{group,port,buffer_max_bytes,read_timeout_sec}`, `speaker.{dds_interface,volume,app_name,begin_stream_pause_sec,wait_finish_margin_sec}`
**Env overrides** (highest precedence): `MARCUS_GEMINI_API_KEY` (or `SANAD_GEMINI_API_KEY` fallback), `MARCUS_GEMINI_MODEL`, `MARCUS_GEMINI_VOICE`.
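Resolution order is env first, then the config key. A sketch of the precedence (helper name hypothetical; `Optional` because the marcus env is Python 3.8):

```python
import os
from typing import Optional

def _resolve_gemini_api_key(stt_cfg: dict) -> Optional[str]:
    # Env always wins over config_Voice.json; SANAD_GEMINI_API_KEY is the legacy fallback.
    return (
        os.environ.get("MARCUS_GEMINI_API_KEY")
        or os.environ.get("SANAD_GEMINI_API_KEY")
        or stt_cfg.get("gemini_api_key")
    )
```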
---
| Knob | Location | Effect |
|---|---|---|
| `subsystems.lidar` | config_Brain.json | SLAM subprocess on/off |
| `subsystems.voice` | config_Brain.json | Gemini Live STT + dispatch + TtsMaker loop on/off |
| `subsystems.imgsearch` | config_Brain.json | image-guided search init on/off |
| `subsystems.autonomous` | config_Brain.json | auto-patrol state machine init on/off |
| `num_batch`, `num_ctx` | config_Brain.json | llama.cpp compute-graph size (128 / 2048 ≈ 1.8 GiB graph — **do not raise** on 16 GB Jetson) |
| `num_predict_main` | config_Brain.json | 120 tokens max for the main JSON reply |
| `yolo_device`, `yolo_half` | config_Vision.json | `cuda` / FP16 (hard-required; CPU not allowed) |
| `mic.backend` | config_Voice.json | `builtin_udp` (G1 array — only option used by Gemini path) |
| `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast |
| `mic_udp.read_timeout_sec` | config_Voice.json | `BuiltinMic.read_chunk` budget (default 0.04 s) |
| `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) — used by `AudioAPI.speak()` for the brain's reply |
| `stt.wake_words` | config_Voice.json | 33 fuzzy variants of "Sanad" — wake-word gate at dispatch time |
| `stt.command_vocab` | config_Voice.json | 68 canonical command phrases for fuzzy-normalization (`"turn right up"` → `"turn right"`) |
| `stt.garbage_patterns` | config_Voice.json | 17 noise/filler phrases to reject (`"thanks for watching"`, `"okay"`, single letters) |
| `stt.gemini_model` | config_Voice.json | Gemini Live model id (default `gemini-2.5-flash-native-audio-preview-12-2025`); env `MARCUS_GEMINI_MODEL` wins |
| `stt.gemini_api_key` | config_Voice.json | API key fallback (env `MARCUS_GEMINI_API_KEY` or `SANAD_GEMINI_API_KEY` preferred) |
| `stt.gemini_vad_*` | config_Voice.json | server-side VAD start/end sensitivity, prefix padding, silence duration |
| `stt.gemini_session_timeout_sec` | config_Voice.json | reconnect cadence (660 s = Live API session cap) |
| `stt.gemini_record_enabled` | config_Voice.json | save `<ts>_user.wav` per turn under `Data/Voice/Recordings/gemini_turns/` |
| `timeout_ms`, `stale_threshold_s`, `reconnect_delay_s` | config_Camera.json | RealSense frame timeout, reconnect trigger, initial backoff |
| `default_max_steps`, `step_delay_s`, `rotate_speed`, `min_steps_warmup` | config_ImageSearch.json | image-guided search rotation cadence (wired into `Vision/marcus_imgsearch.py`) |
| `default_walk_speed`, `dist_tolerance`, `angle_tolerance`, `safety_timeout_mult`, `dr_update_hz` | config_Odometry.json | precise motion control (wired into `Navigation/marcus_odometry.py`) |
---
## Per-command latency (estimated)

| Step | Typical | Notes |
|---|---|---|
| Mic chunk → Gemini Live | ~32 ms | 512-sample PCM blob over WebSocket |
| Gemini server-side VAD turn-end | ~200 ms | configurable via `gemini_vad_silence_duration_ms` (default 200) |
| Gemini transcript emission | 100–500 ms | depends on utterance length; partials may stream |
| Wake-word check + fuzzy-normalize | <5 ms | `re.search` + difflib against 68 phrases |
| Dispatch dedup | <1 ms | string compare + cooldown |
| Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall |
| Ollama Qwen2.5-VL | 800–1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` |
| Executor + ZMQ send | <10 ms | fire-and-forget PUB |
| TtsMaker playback | ~len(text) × 80 ms | synthesizes + plays on robot |
| `flush_mic` × 2 | <1 ms each | bracketed around `audio_api.speak()` |

**Total user-stops-talking → answer-playback:** ~**1.5–3 s** for a short vision question like "Sanad, what do you see" — Gemini's instant turn-detection saves the 2 s "Yes" ack the previous Whisper-era pipeline needed.
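The TtsMaker row follows directly from the 80 ms-per-character pacing the speak path sleeps on (`time.sleep(len(text) * 0.08)`). A quick check with an illustrative reply:

```python
reply = "I see a chair and a table."
print(round(len(reply) * 0.08, 2))  # 26 chars -> 2.08 s of playback budget
```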


| Subsystem | Role | Tech | Runs on |
|---|---|---|---|
| **Brain** (reason, speak, decide) | Parse commands, reason about vision, pick actions | **Qwen2.5-VL 3B** via Ollama | Jetson GPU |
| **Eyes** (see) | Real-time object/person detection | **YOLOv8m** (CUDA, FP16, 320 px, ~22 FPS) | Jetson GPU |
| **Eyes** (understand) | Open-ended scene understanding, reading, goal-verify | **Qwen2.5-VL** (same brain model) | Jetson GPU |
| **Ears** (hear) | Mic capture + speech-to-text | G1 UDP multicast mic + **Gemini Live STT** (`gemini-2.5-flash-native-audio-preview`, `response_modalities=["TEXT"]`, server-side VAD) | Jetson → Google API |
| **Mouth** (speak) | On-robot TTS for the brain's spoken replies | **Unitree `TtsMaker`** (G1 firmware) | G1 body speaker |
| **Legs** (walk) | 29-DoF locomotion + balance | **Holosoma** RL policy (separate process, ONNX) | Jetson CPU |
| **Hands** (gesture) | Arm & hand actions | **GR00T N1.5** — pending; `API/arm_api.py` is a stub today | Jetson GPU (future) |
| **Inner ear** (map) | SLAM, obstacle detection, localisation | **Livox Mid-360** LiDAR + custom SLAM engine | Jetson (subprocess) |
| **Memory** | Places, session history, facts | JSON files under `Data/Brain/Sessions/` | Jetson disk |
Almost everything runs on-robot. The single cloud dependency is **Gemini Live**
for speech-to-text — chosen because the G1's far-field mic + Whisper-on-CPU
combination produced too many transcription errors during real-world testing.
Vision (YOLO + Qwen2.5-VL), reasoning, motion, navigation, memory, LiDAR — all
local on the Jetson. TTS replies still go through G1's on-board `TtsMaker`,
not Gemini.
---
Three input modalities, same command loop:
- **Voice** — Gemini Live streams the mic continuously and emits transcripts. When the transcript starts with "**Sanad**" plus a request, Marcus's brain handles it (motion / VLM / Q&A) and replies through the G1 speaker via TtsMaker. No local wake detector, no acoustic ack — Gemini's server-side VAD decides when you've stopped speaking.
- **Text** — type the same command into `run_marcus.py`'s terminal.
- **WebSocket (remote)** — `Client/marcus_cli.py` or `Client/marcus_client.py` (Tkinter GUI) send commands from a workstation.
There are two schools for combining them:

| Capability | Implementation | Hand-coded or learned? |
|---|---|---|
| Vision — open-ended scene understanding | same VLM | learned |
| Legs / locomotion | **RL policy** (Holosoma, ONNX) | learned |
| Arms / gestures | SDK action-ID lookup | **hand-coded** |
| Wake word gating | String match on `wake_words` after Gemini transcribes | hand-coded |
| STT (command) | Gemini Live (`gemini-2.5-flash-native-audio-preview`) | cloud-hosted |
| TTS | Unitree `TtsMaker` (on-robot DSP) | firmware |
| Glue between layers | Python + ZMQ + JSON | hand-coded |
---

Same hardware, different prompts + wake word.
- **Prompts** rewrite: *"You are a museum guide. When a visitor asks about an exhibit, describe it in two sentences and invite them to ask follow-ups."*
- **Places** memory pre-loaded with exhibit waypoints; `patrol: exhibit_A → exhibit_B → exit` follows a tour.
- Wake word variants in `config_Voice.json::stt.wake_words` (fuzzy list, handles common mishearings of "Sanad" Gemini sometimes emits).
- Image search (`search/ photo_of_exhibit.jpg`) lets visitors hold up a printed map; the robot navigates to the matching location.
- YOLO classes trimmed to people-only if the venue doesn't need object safety.
No code changes required for either deployment.
        ┌───────┴────────┬──────────────┬────────────┐
        ▼                ▼              ▼            ▼
     Vision/         Navigation/     Voice/        Lidar/
  YOLO, imgsearch    goal_nav,       audio_io,     SLAM engine
                     patrol, odom    builtin_tts,  (subprocess)
                                     gemini_script,
                                     turn_recorder,
                                     marcus_voice
cd ~/holosoma && python3 src/holosoma_inference/.../run_policy.py ...
ollama serve > /tmp/ollama.log 2>&1 &
sleep 3

# 3) Install the Gemini SDK in its own Python 3.10+ env (one-time)
# google-genai requires Python ≥3.9; marcus is pinned to 3.8 by the
# Jetson torch wheel, so Gemini runs in a sibling conda env.
conda create -n gemini_sdk python=3.10 -y
conda activate gemini_sdk
pip install google-genai numpy
conda deactivate
# 4) Provide the Gemini key (voice is the only cloud dep)
export MARCUS_GEMINI_API_KEY='<your-key>' # SANAD_GEMINI_API_KEY also accepted
# Optional: only needed if gemini_sdk env is NOT at ~/miniconda3/envs/gemini_sdk/
# export MARCUS_GEMINI_PYTHON=/path/to/gemini_sdk/bin/python
# 5) Start Marcus
conda activate marcus
cd ~/Marcus
python3 run_marcus.py
Marcus/
├── Brain/          orchestrator, parser, executor, memory
├── Vision/         YOLO + image-guided search
├── Navigation/     goal nav, patrol, odometry
├── Voice/          audio I/O (mic + speaker), Gemini Live STT, TtsMaker
├── Autonomous/     exploration state machine
├── Lidar/          SLAM engine (subprocess)
├── Server/         WebSocket interface
## Design principles

1. **Offline-first where it matters.** Vision, reasoning, motion, navigation,
   memory, LiDAR — all on the Jetson. The single cloud dependency is Gemini
   Live STT (speech in only, text out — Marcus's brain still owns the reply).
   It can be swapped for any other STT by reimplementing `Voice/gemini_script.py`
   behind the same `start()/stop()` + `on_command(text, lang)` callback
   (sketched below).
2. **GPU mandatory.** YOLO refuses to start on CPU — Marcus is a safety-critical
   robot, silently downgrading to 2 FPS vision is worse than failing loudly.
3. **Swappable subsystems.** Each API file can be reimplemented behind the same
   public functions. Replace YOLO with DETR, Qwen with LLaVA, TtsMaker with
   Piper, Gemini STT with Whisper — Brain never notices.
4. **Config over code.** Tunables live in `Config/*.json` / `.yaml`; every key
   is actively referenced (0 orphans). Change persona, wake word, enabled
   subsystems, or thresholds without touching a `.py` file.
5. **English only.** Arabic support was removed because the G1 firmware's TTS
   silently maps Arabic to Chinese. If bilingual TTS is ever needed again,
   see `git log` for the removed Piper / edge-tts paths.
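A sketch of the seam that principles 1 and 3 lean on. The ABC is illustrative (no such class exists in the repo); the real contract is just `start()`/`stop()` plus the `on_command(text, lang)` callback:

```python
from abc import ABC, abstractmethod
from typing import Callable

class SttBackend(ABC):
    """Illustrative seam: GeminiBrain satisfies this shape today; a local
    Whisper wrapper could satisfy it tomorrow without Brain changes."""

    def __init__(self, on_command: Callable[[str, str], None]):
        self.on_command = on_command  # invoked as on_command(text, "en")

    @abstractmethod
    def start(self) -> None:
        """Open the mic/session and begin emitting transcripts."""

    @abstractmethod
    def stop(self) -> None:
        """Tear down threads and sockets."""
```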

Voice/audio_io.py (new file, 345 lines)
"""Hardware-agnostic audio I/O for Marcus voice pipelines.
Direct port of /home/zedx/Robotics_workspace/yslootahtech/Project/Sanad/voice/audio_io.py,
with USB mic/speaker profiles (Anker/Hollyland) removed — Marcus only uses the
G1 on-board profile. Class names and method signatures match Sanad verbatim so
the rest of the Gemini brain code lifts over cleanly.
Mics deliver int16 mono PCM at 16 kHz.
Speakers accept int16 mono PCM plus a `source_rate` and resample internally.
Usage:
audio = AudioIO.from_profile("builtin", audio_client=ac)
audio.start()
try:
chunk = audio.mic.read_chunk(1024)
audio.speaker.begin_stream()
audio.speaker.send_chunk(pcm_24k, 24000)
audio.speaker.wait_finish()
finally:
audio.stop()
"""
from __future__ import annotations
import json
import logging
import os
import socket
import struct
import subprocess
import sys
import threading
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Optional, Union
import numpy as np
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
sys.path.insert(0, _PROJECT_DIR)
try:
from Core.config_loader import load_config
_VCFG = load_config("Voice") or {}
except Exception:
_VCFG = {}
log = logging.getLogger("audio_io")
_MIC_CFG = _VCFG.get("mic_udp", {}) or {}
_SP_CFG = _VCFG.get("speaker", {}) or {}
TARGET_MIC_RATE = 16_000
_MCAST_GRP = _MIC_CFG.get("group", "239.168.123.161")
_MCAST_PORT = int(_MIC_CFG.get("port", 5555))
_MIC_BUF_MAX = int(_MIC_CFG.get("buffer_max_bytes", 64_000))
_MIC_READ_TIMEOUT = float(_MIC_CFG.get("read_timeout_sec", 0.04))
PCMLike = Union[bytes, bytearray, memoryview, np.ndarray]
def _find_g1_local_ip() -> str:
"""Find the host IPv4 address on the G1's internal 192.168.123.0/24 network."""
out = subprocess.run(
["ip", "-4", "-o", "addr"], capture_output=True, text=True,
).stdout
for line in out.splitlines():
for tok in line.split():
if tok.startswith("192.168.123."):
return tok.split("/")[0]
raise RuntimeError("no 192.168.123.x interface found")
def _resample_int16(pcm: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
if src_rate == dst_rate or pcm.size == 0:
return pcm.astype(np.int16, copy=False)
target_len = max(1, int(len(pcm) * dst_rate / src_rate))
return np.interp(
np.linspace(0, len(pcm), target_len, endpoint=False),
np.arange(len(pcm)),
pcm.astype(np.float64),
).astype(np.int16)
def _as_int16_array(pcm: PCMLike) -> np.ndarray:
if isinstance(pcm, np.ndarray):
return pcm.astype(np.int16, copy=False)
return np.frombuffer(bytes(pcm), dtype=np.int16)
# ─── Protocols ────────────────────────────────────────────
class Mic(ABC):
sample_rate: int = TARGET_MIC_RATE
@abstractmethod
def start(self) -> None: ...
@abstractmethod
def read_chunk(self, num_bytes: int) -> bytes: ...
@abstractmethod
def flush(self) -> None: ...
@abstractmethod
def stop(self) -> None: ...
class Speaker(ABC):
@abstractmethod
def begin_stream(self) -> None: ...
@abstractmethod
def send_chunk(self, pcm: PCMLike, source_rate: int) -> None:
"""Queue PCM for playback. `source_rate` is the sample rate of `pcm`."""
@abstractmethod
def wait_finish(self) -> None: ...
@abstractmethod
def stop(self) -> None: ...
@property
@abstractmethod
def interrupted(self) -> bool: ...
@property
def total_sent_sec(self) -> float:
return 0.0
# ─── G1 built-in (UDP mic + AudioClient speaker) ──────────
class BuiltinMic(Mic):
"""G1 robot's on-board mic published over UDP multicast."""
sample_rate = TARGET_MIC_RATE
def __init__(self, group: str = _MCAST_GRP, port: int = _MCAST_PORT,
buf_max: int = _MIC_BUF_MAX):
self._group = group
self._port = port
self._buf_max = buf_max
self._sock = None # type: Optional[socket.socket]
self._buf = bytearray()
self._lock = threading.Lock()
self._running = False
self._thread = None # type: Optional[threading.Thread]
def start(self) -> None:
if self._running:
return
local_ip = _find_g1_local_ip()
self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
self._sock.bind(("", self._port))
mreq = struct.pack(
"4s4s",
socket.inet_aton(self._group),
socket.inet_aton(local_ip),
)
self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
self._sock.settimeout(1.0)
self._running = True
self._thread = threading.Thread(target=self._recv_loop, daemon=True)
self._thread.start()
log.info("BuiltinMic joined %s:%d on %s", self._group, self._port, local_ip)
def _recv_loop(self) -> None:
while self._running:
try:
data, _ = self._sock.recvfrom(4096)
with self._lock:
self._buf.extend(data)
if len(self._buf) > self._buf_max:
del self._buf[:len(self._buf) - self._buf_max]
except socket.timeout:
continue
except Exception:
if self._running:
time.sleep(0.01)
def read_chunk(self, num_bytes: int) -> bytes:
deadline = time.time() + _MIC_READ_TIMEOUT
while time.time() < deadline:
with self._lock:
if len(self._buf) >= num_bytes:
chunk = bytes(self._buf[:num_bytes])
del self._buf[:num_bytes]
return chunk
time.sleep(0.003)
with self._lock:
avail = len(self._buf)
if avail > 0:
chunk = bytes(self._buf[:avail])
del self._buf[:avail]
return chunk + b"\x00" * (num_bytes - avail)
return b"\x00" * num_bytes
def flush(self) -> None:
with self._lock:
self._buf.clear()
def stop(self) -> None:
self._running = False
if self._sock is not None:
try:
self._sock.close()
except Exception:
pass
self._sock = None
class BuiltinSpeaker(Speaker):
"""G1 robot's built-in speaker via AudioClient.PlayStream (16 kHz mono)."""
HARDWARE_RATE = 16_000
def __init__(self, audio_client: Any, app_name: Optional[str] = None):
self._ac = audio_client
try:
self._ac.SetVolume(100)
except Exception:
log.warning("BuiltinSpeaker.SetVolume failed")
self._app_name = app_name or _SP_CFG.get("app_name", "marcus")
self._begin_pause = float(_SP_CFG.get("begin_stream_pause_sec", 0.15))
self._finish_margin = float(_SP_CFG.get("wait_finish_margin_sec", 0.3))
self._stop_flag = threading.Event()
self._stream_id = None # type: Optional[str]
self._total_sent = 0.0
self._play_start = 0.0
def _stop_play_api(self) -> None:
try:
from unitree_sdk2py.g1.audio.g1_audio_api import (
ROBOT_API_ID_AUDIO_STOP_PLAY,
)
self._ac._Call(
ROBOT_API_ID_AUDIO_STOP_PLAY,
json.dumps({"app_name": self._app_name}),
)
except Exception:
log.warning("BuiltinSpeaker AUDIO_STOP_PLAY failed")
def begin_stream(self) -> None:
self._stop_flag.clear()
self._stop_play_api()
time.sleep(self._begin_pause)
self._stream_id = "s_{}".format(int(time.time() * 1000))
self._total_sent = 0.0
self._play_start = time.time()
def send_chunk(self, pcm: PCMLike, source_rate: int) -> None:
if self._stop_flag.is_set():
return
arr = _as_int16_array(pcm)
if arr.size < 10:
return
hw = _resample_int16(arr, int(source_rate), self.HARDWARE_RATE)
self._ac.PlayStream(self._app_name, self._stream_id, hw.tobytes())
self._total_sent += len(hw) / float(self.HARDWARE_RATE)
def wait_finish(self) -> None:
elapsed = time.time() - self._play_start
remaining = self._total_sent - elapsed + self._finish_margin
waited = 0.0
while waited < remaining and not self._stop_flag.is_set():
time.sleep(0.1)
waited += 0.1
self._stop_play_api()
def stop(self) -> None:
self._stop_flag.set()
self._stop_play_api()
@property
def interrupted(self) -> bool:
return self._stop_flag.is_set()
@property
def total_sent_sec(self) -> float:
return self._total_sent
# ─── AudioIO factory ──────────────────────────────────────
_PROFILE_ALIASES = {
"builtin": "builtin",
"g1": "builtin",
"g1_builtin": "builtin",
}
SUPPORTED_PROFILES = ("builtin",)
@dataclass
class AudioIO:
mic: Mic
speaker: Speaker
profile_id: str = field(default="builtin")
def start(self) -> None:
self.mic.start()
def stop(self) -> None:
try:
self.speaker.stop()
except Exception:
log.warning("AudioIO speaker.stop failed", exc_info=True)
try:
self.mic.stop()
except Exception:
log.warning("AudioIO mic.stop failed", exc_info=True)
@classmethod
def from_profile(
cls,
profile_id: str,
*,
audio_client: Optional[Any] = None,
) -> "AudioIO":
"""Build an AudioIO for the requested profile.
`audio_client` is the initialised `unitree_sdk2py` `AudioClient` and
is required for the `builtin` profile (the G1 on-board speaker).
"""
raw = (profile_id or "").strip().lower()
resolved = _PROFILE_ALIASES.get(raw)
if resolved is None:
raise ValueError(
"unknown audio profile {!r}; supported: {}".format(
profile_id, ", ".join(SUPPORTED_PROFILES),
)
)
if resolved == "builtin":
if audio_client is None:
raise ValueError(
"profile 'builtin' requires audio_client (G1 AudioClient)"
)
return cls(
mic=BuiltinMic(),
speaker=BuiltinSpeaker(audio_client),
profile_id=resolved,
)
raise AssertionError("unhandled resolved profile: {!r}".format(resolved))
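A hypothetical smoke test for the two pure helpers in this file (run from the project root with numpy installed). The expected length follows from `_resample_int16`'s `target_len = len(pcm) * dst_rate / src_rate` arithmetic:

```python
import numpy as np

from Voice.audio_io import _as_int16_array, _resample_int16

# 100 ms of a 440 Hz tone at 24 kHz, int16
t = np.arange(2400) / 24_000.0
tone = (0.3 * 32767 * np.sin(2 * np.pi * 440 * t)).astype(np.int16)

hw = _resample_int16(tone, 24_000, 16_000)       # down to the 16 kHz hardware rate
assert hw.dtype == np.int16 and len(hw) == 1600  # 2400 * 16/24

assert np.array_equal(_as_int16_array(tone.tobytes()), tone)  # bytes round-trip
```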

Voice/builtin_mic.py (rewritten as a backward-compat shim)
""" """
builtin_mic.py G1 built-in microphone (UDP multicast capture) builtin_mic.py backward-compat shim.
================================================================
The G1 humanoid's on-board microphone is published by the Unitree firmware
as an RTP-like UDP multicast stream on 239.168.123.161:5555, carrying
16 kHz mono int16 PCM. Any host on the robot's 192.168.123.0/24 network
can join the group and read the audio no extra SDK call required.
This module intentionally has no dependency on pyaudio, pulseaudio, or the The G1 on-board microphone implementation now lives in
unitree_sdk2py package. Joining the multicast group is all that's needed. [Voice/audio_io.py](Voice/audio_io.py) where it can be paired with the
matching BuiltinSpeaker via `AudioIO.from_profile("builtin", ...)`
the same structure Sanad uses.
Usage: This module exists so existing imports (`from Voice.builtin_mic import
from Voice.builtin_mic import BuiltinMic BuiltinMic`) keep working for the non-Gemini voice paths and for
mic = BuiltinMic() `API/audio_api.py`. It subclasses the canonical `BuiltinMic` and adds
mic.start() `read_seconds()`, which is used by `API/audio_api.record()`.
try:
chunk = mic.read_chunk(1024) # 512 samples, 32 ms at 16 kHz
...
finally:
mic.stop()
Ported from Project/Sanad/voice/audio_io.py (Sanad's production implementation). Do not add new code here extend Voice/audio_io.py instead.
""" """
from __future__ import annotations from __future__ import annotations
import os
import socket
import struct
import subprocess
import sys
import threading
import time import time
from typing import Optional
# Load defaults from Config/config_Voice.json::mic_udp so they can be tuned from Voice.audio_io import BuiltinMic as _BaseBuiltinMic
# without editing code. Falls back to the hardcoded literals below if the
# config isn't reachable (e.g., when imported from a test harness).
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
sys.path.insert(0, _PROJECT_DIR)
try:
from Core.config_loader import load_config
_mic_udp = (load_config("Voice") or {}).get("mic_udp", {}) or {}
except Exception:
_mic_udp = {}
DEFAULT_GROUP = str(_mic_udp.get("group", "239.168.123.161"))
DEFAULT_PORT = int(_mic_udp.get("port", 5555))
DEFAULT_BUF_MAX = int(_mic_udp.get("buffer_max_bytes", 64_000)) # ~2 s of 16 kHz mono int16
DEFAULT_READ_TIMEOUT = float(_mic_udp.get("read_timeout_sec", 0.04)) # budget per read_chunk call
SAMPLE_RATE = 16_000 # hardware rate — do not change
def _find_g1_local_ip() -> str: class BuiltinMic(_BaseBuiltinMic):
""" """G1 on-board mic + `read_seconds()` convenience."""
Return the host IPv4 on the G1's internal 192.168.123.0/24 network.
Required by IP_ADD_MEMBERSHIP so the kernel knows which NIC to join on.
"""
out = subprocess.run(
["ip", "-4", "-o", "addr"], capture_output=True, text=True,
).stdout
for line in out.splitlines():
for tok in line.split():
if tok.startswith("192.168.123."):
return tok.split("/")[0]
raise RuntimeError(
"BuiltinMic: no interface on 192.168.123.0/24 — "
"host is not on the G1's internal network"
)
class BuiltinMic:
"""
G1 on-board microphone over UDP multicast.
Thread-safe: a background daemon thread receives datagrams into an
internal ring buffer; `read_chunk(n)` pulls the next `n` bytes or
blocks up to `read_timeout` before returning zeros.
"""
sample_rate = SAMPLE_RATE
def __init__(
self,
group: str = DEFAULT_GROUP,
port: int = DEFAULT_PORT,
buf_max: int = DEFAULT_BUF_MAX,
read_timeout: float = DEFAULT_READ_TIMEOUT,
):
self._group = group
self._port = port
self._buf_max = buf_max
self._read_timeout = read_timeout
self._sock: Optional[socket.socket] = None
self._buf = bytearray()
self._lock = threading.Lock()
self._running = False
self._thread: Optional[threading.Thread] = None
def start(self) -> None:
if self._running:
return
local_ip = _find_g1_local_ip()
self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
self._sock.bind(("", self._port))
mreq = struct.pack(
"4s4s",
socket.inet_aton(self._group),
socket.inet_aton(local_ip),
)
self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq)
self._sock.settimeout(1.0)
self._running = True
self._thread = threading.Thread(
target=self._recv_loop, daemon=True, name="builtin_mic_rx",
)
self._thread.start()
print(f" [BuiltinMic] joined {self._group}:{self._port} on {local_ip}")
def _recv_loop(self) -> None:
while self._running:
try:
data, _ = self._sock.recvfrom(4096)
with self._lock:
self._buf.extend(data)
# ring-buffer: drop oldest when we'd exceed buf_max
if len(self._buf) > self._buf_max:
del self._buf[: len(self._buf) - self._buf_max]
except socket.timeout:
continue
except Exception:
if self._running:
time.sleep(0.01)
def read_chunk(self, num_bytes: int) -> bytes:
"""
Return exactly `num_bytes` of 16 kHz mono int16 PCM.
Waits up to `read_timeout` for that many bytes to be available.
If the buffer is still short after the timeout, returns whatever
is available padded with silence. Never blocks forever.
"""
deadline = time.time() + self._read_timeout
while time.time() < deadline:
with self._lock:
if len(self._buf) >= num_bytes:
chunk = bytes(self._buf[:num_bytes])
del self._buf[:num_bytes]
return chunk
time.sleep(0.003)
with self._lock:
avail = len(self._buf)
if avail > 0:
chunk = bytes(self._buf[:avail])
del self._buf[:avail]
return chunk + b"\x00" * (num_bytes - avail)
return b"\x00" * num_bytes
class BuiltinMic(_BaseBuiltinMic):
"""G1 on-board mic + `read_seconds()` convenience."""
def read_seconds(self, seconds: float) -> bytes:
"""
Capture `seconds` of audio and return as bytes.
Blocks for the full duration (not a real-time producer).
"""
num_bytes = int(seconds * self.sample_rate * 2)  # 2 bytes/sample (int16)
out = bytearray()
chunk_bytes = 1024
while len(out) < num_bytes:
out.extend(self.read_chunk(min(chunk_bytes, num_bytes - len(out))))
return bytes(out)
def flush(self) -> None:
"""Drop all buffered audio (e.g. after the robot spoke)."""
with self._lock:
self._buf.clear()
def stop(self) -> None:
self._running = False
if self._sock is not None:
try:
self._sock.close()
except Exception:
pass
self._sock = None
if self._thread is not None:
self._thread.join(timeout=1.5)
self._thread = None
# ────────────────────────────────────────────────────────────────
# Standalone test — capture 3 s and print energy stats
# ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
import array
print("BuiltinMic standalone test — capturing 3 s from G1...")
mic = BuiltinMic()
mic.start()
time.sleep(0.3)  # let the receiver thread warm up
raw = mic.read_seconds(3.0)
mic.stop()
@ -212,4 +55,4 @@ if __name__ == "__main__":
if mean_abs > 30:
print(" OK — mic is capturing audio")
else:
print(" WARN — signal very low, check G1 audio service is running")

Voice/gemini_runner.py Normal file

@ -0,0 +1,440 @@
#!/usr/bin/env python3
"""Voice/gemini_runner.py — Gemini Live STT subprocess.
Runs in a Python 3.10+ env (the `gemini_sdk` conda env on this Jetson) so it
can import `google-genai`, which doesn't support Python 3.8. The marcus env
itself is pinned to Python 3.8 by the NVIDIA Jetson torch wheel, so Gemini
has to live in its own process — the same pattern Sanad uses.
The marcus parent process spawns this script via:
/path/to/gemini_sdk/python -u Voice/gemini_runner.py
and parses the JSON-lines stream we emit on stdout. The parent never sees
audio bytes — this script owns the mic, the Gemini WebSocket, AND the WAV
recording, so the IPC boundary stays narrow (just transcripts).
Stdout protocol (one JSON object per line, UTF-8):
{"type":"ready"} session connected, mic is live
{"type":"user", "text":"..."} user input transcription
{"type":"bot", "text":"..."} Gemini's text reply (logged only — never spoken)
{"type":"turn_end"} Gemini emitted turn_complete
{"type":"reconnect", "reason":"..."} session ended, will reconnect
{"type":"log", "level":"info|warn|error", "msg":"..."}
Stdin protocol (line-based):
"stop\n" request graceful shutdown
Exit codes:
0 clean shutdown after "stop" or signal
2 google-genai not importable
3 no API key
4 fatal session loop crash
Env vars:
MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY) required
MARCUS_GEMINI_MODEL (optional) model id
MARCUS_GEMINI_VOICE (optional, ignored in TEXT mode)
MARCUS_PROJECT_ROOT (optional) for sys.path
This file uses Python 3.10+ syntax (type unions with `|`, etc.) because
the gemini_sdk env is 3.10+. DO NOT try to import it from the marcus Python 3.8 env.
"""
from __future__ import annotations
import asyncio
import json
import os
import signal
import sys
import threading
import time
from typing import Any
import numpy as np
# Make the Marcus project importable so we can reuse Voice/audio_io.py and
# Voice/turn_recorder.py (both pure-stdlib + numpy, no Python-version traps).
_PROJECT_ROOT = (
os.environ.get("MARCUS_PROJECT_ROOT")
or os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)
if _PROJECT_ROOT not in sys.path:
sys.path.insert(0, _PROJECT_ROOT)
from Voice.audio_io import BuiltinMic
from Voice.turn_recorder import TurnRecorder
try:
from Core.config_loader import load_config
_VCFG = load_config("Voice") or {}
except Exception:
_VCFG = {}
_STT = _VCFG.get("stt", {})
# ─── stdout / stderr helpers ──────────────────────────────────────
_stdout_lock = threading.Lock()
def emit(payload: dict) -> None:
"""Write one JSON line to stdout. Thread-safe + flushed."""
line = json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
with _stdout_lock:
sys.stdout.write(line + "\n")
sys.stdout.flush()
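# e.g. emit({"type": "user", "text": "stand up"}) writes exactly:
#   {"type":"user","text":"stand up"}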
def log(level: str, msg: str) -> None:
"""Send a log line to the parent (parent forwards to logs/voice.log)."""
emit({"type": "log", "level": level, "msg": msg})
# ─── stdin watcher (graceful shutdown) ────────────────────────────
_STOP_REQUESTED = threading.Event()
_MIC_HOLDER: list = [] # length-≤1 list — holds the active BuiltinMic
def _stdin_watcher() -> None:
try:
for line in sys.stdin:
cmd = line.strip().lower()
if cmd == "stop":
log("info", "stop received from parent — exiting")
_STOP_REQUESTED.set()
return
elif cmd == "flush":
# Parent asks us to drop buffered mic audio (e.g. before
# TtsMaker plays a reply, so the robot's own voice doesn't
# come back as a fake user utterance).
if _MIC_HOLDER:
try:
_MIC_HOLDER[0].flush()
except Exception:
pass
except Exception:
return
threading.Thread(target=_stdin_watcher, daemon=True, name="stdin-watcher").start()
def _install_signal_handlers() -> None:
def _handle(_signum, _frame):
log("info", "signal received — exiting")
_STOP_REQUESTED.set()
for sig in (signal.SIGTERM, signal.SIGINT):
try:
signal.signal(sig, _handle)
except Exception:
pass
# ─── tunables (mirrors Voice/gemini_script.py reads) ──────────────
_MODEL = os.environ.get(
"MARCUS_GEMINI_MODEL",
_STT.get("gemini_model", "gemini-2.5-flash-native-audio-preview-12-2025"),
)
_DEFAULT_VOICE = os.environ.get(
"MARCUS_GEMINI_VOICE",
_STT.get("gemini_voice_name", "Charon"),
)
_API_KEY = (
os.environ.get("MARCUS_GEMINI_API_KEY")
or os.environ.get("SANAD_GEMINI_API_KEY")
or _STT.get("gemini_api_key", "")
)
_MIC_GAIN = float(_STT.get("mic_gain", 1.0))
_SESSION_TIMEOUT = float(_STT.get("gemini_session_timeout_sec", 660))
_MAX_RECONNECT_DELAY = float(_STT.get("gemini_max_reconnect_delay_sec", 30))
_MAX_CONSECUTIVE_ERRORS = int(_STT.get("gemini_max_consecutive_errors", 10))
_NO_MESSAGES_TIMEOUT = float(_STT.get("gemini_no_messages_timeout_sec", 30))
SEND_SAMPLE_RATE = int(_STT.get("gemini_send_sample_rate", 16000))
CHUNK_SIZE = int(_STT.get("gemini_chunk_size", 512))
_CHUNK_BYTES = CHUNK_SIZE * 2
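# With the defaults above: 512 samples of 16 kHz mono int16 = 1024 bytes per
# Blob, i.e. one realtime send every 32 ms.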
_REC_ENABLED = bool(_STT.get("gemini_record_enabled", True))
_RECV_RATE = int(_STT.get("gemini_receive_sample_rate", 24000))
_DATA_DIR = os.path.join(
_PROJECT_ROOT,
_VCFG.get("audio", {}).get("data_dir", "Data/Voice/Recordings"),
"gemini_turns",
)
_SYS_PROMPT = _STT.get(
"gemini_system_prompt",
"Transcribe what the user says to Sanad. Stay silent.",
)
_SP_FILE = _STT.get("gemini_system_prompt_file", "")
if _SP_FILE:
_sp_path = (
_SP_FILE if os.path.isabs(_SP_FILE)
else os.path.join(_PROJECT_ROOT, _SP_FILE)
)
try:
with open(_sp_path, "r", encoding="utf-8") as f:
txt = f.read().strip()
if txt:
_SYS_PROMPT = txt
except Exception:
pass
# The parent (Voice/gemini_script.py) forwards an already-resolved prompt via
# MARCUS_GEMINI_SYSTEM_PROMPT; let that env var override the config fallbacks.
_SYS_PROMPT = os.environ.get("MARCUS_GEMINI_SYSTEM_PROMPT") or _SYS_PROMPT
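# Corresponding Voice.json keys (names as read above; the file path is
# illustrative):
#   "stt": {
#     "gemini_system_prompt": "Transcribe what the user says to Sanad. Stay silent.",
#     "gemini_system_prompt_file": "Voice/gemini_prompt.txt"
#   }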
# ─── main async loop ──────────────────────────────────────────────
def _build_config(types):
vad_start = _STT.get("gemini_vad_start_sensitivity", "START_SENSITIVITY_HIGH")
vad_end = _STT.get("gemini_vad_end_sensitivity", "END_SENSITIVITY_LOW")
prefix_ms = int(_STT.get("gemini_vad_prefix_padding_ms", 20))
silence_ms = int(_STT.get("gemini_vad_silence_duration_ms", 200))
return types.LiveConnectConfig(
response_modalities=["TEXT"],
realtime_input_config=types.RealtimeInputConfig(
automatic_activity_detection=types.AutomaticActivityDetection(
disabled=False,
start_of_speech_sensitivity=getattr(types.StartSensitivity, vad_start),
end_of_speech_sensitivity=getattr(types.EndSensitivity, vad_end),
prefix_padding_ms=prefix_ms,
silence_duration_ms=silence_ms,
),
),
input_audio_transcription=types.AudioTranscriptionConfig(),
system_instruction=types.Content(
parts=[types.Part(text=_SYS_PROMPT)],
),
)
async def _send_mic_loop(session, types_mod, mic, recorder, done: asyncio.Event) -> None:
loop = asyncio.get_running_loop()  # we're inside a coroutine, so a loop is running
frame_pause = CHUNK_SIZE / float(SEND_SAMPLE_RATE)
last_activity = time.time()
while not done.is_set() and not _STOP_REQUESTED.is_set():
try:
raw = await loop.run_in_executor(None, mic.read_chunk, _CHUNK_BYTES)
except Exception as e:
log("warn", f"mic read failed: {e}")
break
if not raw:
await asyncio.sleep(frame_pause)
continue
if _MIC_GAIN != 1.0:
samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32)
samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16)
raw = samples.tobytes()
# Per-turn user-audio capture for the WAV recorder. We don't have
# Gemini's "is the AI speaking" flag (no audio out), so capture
# whenever we have meaningful energy.
try:
samples_view = np.frombuffer(raw, dtype=np.int16)
if samples_view.size and int(np.abs(samples_view).max()) > 250:
recorder.capture_user(raw)
except Exception:
pass
now = time.time()
if now - last_activity > 10:
log("info", f"alive (idle {now - last_activity:.0f}s)")
last_activity = now
try:
await session.send_realtime_input(
audio=types_mod.Blob(
data=raw,
mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}",
),
)
except asyncio.CancelledError:
return
except Exception as e:
log("warn", f"mic send failed: {e}")
done.set()
return
await asyncio.sleep(frame_pause)
async def _receive_loop(session, recorder, done: asyncio.Event) -> None:
last_recv = time.time()
try:
while not done.is_set() and not _STOP_REQUESTED.is_set():
async for response in session.receive():
last_recv = time.time()
if done.is_set():
break
if (hasattr(response, "go_away")
and getattr(response, "go_away", None) is not None):
emit({"type": "reconnect", "reason": "server go_away"})
done.set()
return
sc = getattr(response, "server_content", None)
if sc is None:
continue
it = getattr(sc, "input_transcription", None)
if it is not None:
text = (getattr(it, "text", "") or "").strip()
if text:
emit({"type": "user", "text": text})
try:
recorder.add_user_text(text)
except Exception:
pass
mt = getattr(sc, "model_turn", None)
if mt is not None:
for part in getattr(mt, "parts", []) or []:
txt = getattr(part, "text", None)
if txt:
txt = txt.strip()
if txt:
emit({"type": "bot", "text": txt})
try:
recorder.add_robot_text(txt)
except Exception:
pass
if getattr(sc, "turn_complete", False):
try:
recorder.finish_turn()
except Exception:
pass
emit({"type": "turn_end"})
if time.time() - last_recv > _NO_MESSAGES_TIMEOUT:
log("warn", f"no messages from Gemini for {_NO_MESSAGES_TIMEOUT:.0f}s")
break
await asyncio.sleep(0.1)
except asyncio.CancelledError:
return
except Exception as e:
log("warn", f"receive ended: {e}")
finally:
done.set()
async def main_async() -> int:
if not _API_KEY:
log("error", "no Gemini API key (set MARCUS_GEMINI_API_KEY)")
return 3
try:
from google import genai
from google.genai import types
except Exception as e:
log("error", f"google-genai not importable: {e}")
return 2
try:
client = genai.Client(api_key=_API_KEY)
except Exception as e:
log("error", f"failed to create Gemini client: {e}")
return 4
config = _build_config(types)
mic = BuiltinMic()
mic.start()
_MIC_HOLDER.append(mic) # expose to the stdin "flush" watcher
recorder = TurnRecorder(
enabled=_REC_ENABLED,
out_dir=_DATA_DIR,
user_rate=SEND_SAMPLE_RATE,
robot_rate=_RECV_RATE,
)
session_num = 0
consecutive_errors = 0
start = time.time()
rc = 0
try:
while not _STOP_REQUESTED.is_set():
session_num += 1
uptime_min = (time.time() - start) / 60
try:
log("info", f"connecting (session #{session_num}, uptime {uptime_min:.0f}m)")
async with client.aio.live.connect(model=_MODEL, config=config) as session:
emit({"type": "ready"})
consecutive_errors = 0
mic.flush()
done = asyncio.Event()
try:
await asyncio.wait_for(
asyncio.gather(
_send_mic_loop(session, types, mic, recorder, done),
_receive_loop(session, recorder, done),
),
timeout=_SESSION_TIMEOUT,
)
except asyncio.TimeoutError:
log("info", f"session timed out after {_SESSION_TIMEOUT:.0f}s")
except asyncio.CancelledError:
pass
log("info", f"session #{session_num} ended — reconnecting in 1s")
try:
mic.flush()
except Exception:
pass
if _STOP_REQUESTED.is_set():
break
await asyncio.sleep(1)
except asyncio.CancelledError:
break
except Exception as e:
consecutive_errors += 1
delay = min(_MAX_RECONNECT_DELAY, 2 ** consecutive_errors)
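# Backoff schedule with defaults: 2 s, 4 s, 8 s, 16 s, then capped at
# _MAX_RECONNECT_DELAY (30 s).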
log("error", f"session error #{consecutive_errors}: {e} — retry in {delay:.0f}s")
try:
await asyncio.sleep(delay)
except asyncio.CancelledError:
break
if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS:
log("warn", f"{consecutive_errors} consecutive errors — recreating client")
try:
client = genai.Client(api_key=_API_KEY)
consecutive_errors = 0
except Exception as ce:
log("error", f"client recreation failed: {ce}")
finally:
try:
mic.stop()
except Exception:
pass
return rc
def main() -> int:
_install_signal_handlers()
try:
return asyncio.run(main_async())
except KeyboardInterrupt:
return 0
except Exception as e:
log("error", f"fatal: {e}")
return 4
if __name__ == "__main__":
sys.exit(main())

Voice/gemini_script.py Normal file

@ -0,0 +1,299 @@
"""Voice/gemini_script.py — subprocess manager for Gemini Live STT.
Runs in marcus's Python 3.8 env. The actual Gemini STT lives in
[Voice/gemini_runner.py](Voice/gemini_runner.py) which has to run in a
Python 3.10+ env (e.g. the `gemini_sdk` conda env on the Jetson) because
`google-genai` doesn't support Python 3.8.
This file spawns the runner as a subprocess, reads JSON-line transcripts
off its stdout, and turns them into the same `on_transcript` / `on_command`
callbacks the rest of marcus expects. The external API of class
`GeminiBrain` is unchanged from the previous in-process port — a drop-in
swap for `Voice/marcus_voice.py::_voice_loop_gemini`.
Sanad uses the same subprocess pattern (its own `live_voice_loop.py`
parses log lines from a Gemini subprocess), so this matches Sanad's
architecture not just in mechanism but in shape.
Subprocess lookup order for the Python 3.10+ binary:
1. env MARCUS_GEMINI_PYTHON (highest priority)
2. config stt.gemini_python_path
3. auto-detect — try a list of common conda env paths
4. raise an explicit error (surfaced in voice.log)
"""
from __future__ import annotations
import json
import logging
import os
import subprocess
import sys
import threading
from typing import Callable, Optional
log = logging.getLogger("gemini_brain")
# Candidate conda-env paths for the Python 3.10+ binary. Override with
# MARCUS_GEMINI_PYTHON or stt.gemini_python_path if the env lives elsewhere.
_DEFAULT_CANDIDATES = [
"~/miniconda3/envs/gemini_sdk/bin/python",
"~/anaconda3/envs/gemini_sdk/bin/python",
"~/.miniconda3/envs/gemini_sdk/bin/python",
"/opt/conda/envs/gemini_sdk/bin/python",
"~/miniconda3/envs/sanad/bin/python",
"~/anaconda3/envs/sanad/bin/python",
]
def _resolve_runner_python(stt_cfg: dict) -> str:
"""Find the Python 3.10+ binary that can import google-genai."""
explicit = (
os.environ.get("MARCUS_GEMINI_PYTHON")
or stt_cfg.get("gemini_python_path", "")
)
if explicit:
path = os.path.expanduser(explicit)
if os.path.isfile(path) and os.access(path, os.X_OK):
return path
raise FileNotFoundError(
"MARCUS_GEMINI_PYTHON / stt.gemini_python_path = "
"{!r} but that binary does not exist or is not executable".format(path)
)
for cand in _DEFAULT_CANDIDATES:
path = os.path.expanduser(cand)
if os.path.isfile(path) and os.access(path, os.X_OK):
log.info("auto-detected gemini-runner python at %s", path)
return path
raise FileNotFoundError(
"no Python 3.10+ env found for the Gemini runner. Set env "
"MARCUS_GEMINI_PYTHON to the path of a conda env's python with "
"`google-genai` installed (e.g. ~/miniconda3/envs/gemini_sdk/bin/python)."
)
class GeminiBrain:
"""Subprocess-managing wrapper around Voice/gemini_runner.py.
External API kept identical to the in-process version so callers don't
care that Gemini lives in another Python:
brain = GeminiBrain(audio_io, recorder, voice_name, system_prompt,
api_key=..., on_transcript=cb1, on_command=cb2)
brain.start()
...
brain.stop()
`audio_io` and `recorder` are accepted for API parity but unused —
the subprocess owns its own mic and writes its own WAVs (one process
owning the whole audio path is simpler than streaming PCM over a pipe).
"""
def __init__(
self,
audio_io, # ignored (runner owns its own)
recorder, # ignored (runner owns its own)
voice_name=None, # forwarded via env
system_prompt="", # forwarded via env (or config)
*,
api_key: str = "",
on_transcript: Optional[Callable[[str], None]] = None,
on_command: Optional[Callable[[str, str], None]] = None,
):
self._voice_name = voice_name or ""
self._system_prompt = system_prompt or ""
self._api_key = api_key
self._on_transcript = on_transcript
self._on_command = on_command
self._proc = None # type: Optional[subprocess.Popen]
self._reader_thread = None # type: Optional[threading.Thread]
self._err_thread = None # type: Optional[threading.Thread]
self._stopping = False
# config-loaded lazily so import order doesn't matter
try:
from Core.config_loader import load_config
cfg = load_config("Voice") or {}
except Exception:
cfg = {}
self._stt = cfg.get("stt", {})
# ─── lifecycle ────────────────────────────────────────
def start(self) -> None:
if self._proc is not None and self._proc.poll() is None:
log.warning("GeminiBrain subprocess already running")
return
self._stopping = False
try:
python_bin = _resolve_runner_python(self._stt)
except FileNotFoundError as e:
log.error("%s", e)
return
runner = os.path.abspath(
os.path.join(os.path.dirname(__file__), "gemini_runner.py")
)
if not os.path.isfile(runner):
log.error("gemini_runner.py not found at %s", runner)
return
env = os.environ.copy()
if self._api_key:
env["MARCUS_GEMINI_API_KEY"] = self._api_key
if self._voice_name:
env["MARCUS_GEMINI_VOICE"] = self._voice_name
# Forward the system prompt via env so the runner doesn't have to
# re-read the JSON file (and so a trimmed inline string survives).
if self._system_prompt:
env["MARCUS_GEMINI_SYSTEM_PROMPT"] = self._system_prompt
env["MARCUS_PROJECT_ROOT"] = os.path.dirname(os.path.dirname(runner))
log.info("spawning gemini runner: %s -u %s", python_bin, runner)
try:
self._proc = subprocess.Popen(
[python_bin, "-u", runner],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
cwd=os.path.dirname(os.path.dirname(runner)),
env=env,
bufsize=1,
universal_newlines=True,
)
except Exception as e:
log.error("failed to spawn gemini runner: %s", e)
self._proc = None
return
self._reader_thread = threading.Thread(
target=self._stdout_reader, daemon=True, name="gemini-stdout",
)
self._reader_thread.start()
self._err_thread = threading.Thread(
target=self._stderr_reader, daemon=True, name="gemini-stderr",
)
self._err_thread.start()
def flush_mic(self) -> None:
"""
Tell the runner subprocess to drop its buffered mic audio.
Used before AND after the brain speaks via TtsMaker so the robot's
own voice (which the mic picks up during playback) doesn't come
back from Gemini as a fake user utterance and accidentally hit
the wake-word gate.
"""
proc = self._proc
if proc is None or proc.stdin is None:
return
try:
if not proc.stdin.closed:
proc.stdin.write("flush\n")
proc.stdin.flush()
except Exception:
pass
def stop(self) -> None:
self._stopping = True
proc = self._proc
if proc is None:
return
# Polite stop: send "stop\n" on stdin, then wait briefly, then SIGTERM.
try:
if proc.stdin and not proc.stdin.closed:
try:
proc.stdin.write("stop\n")
proc.stdin.flush()
except Exception:
pass
except Exception:
pass
try:
proc.wait(timeout=3)
except Exception:
try:
proc.terminate()
except Exception:
pass
try:
proc.wait(timeout=2)
except Exception:
try:
proc.kill()
except Exception:
pass
self._proc = None
# ─── stdout / stderr drainers ─────────────────────────
def _stdout_reader(self) -> None:
proc = self._proc
if proc is None or proc.stdout is None:
return
for line in proc.stdout:
if self._stopping:
break
line = line.strip()
if not line:
continue
try:
msg = json.loads(line)
except Exception:
# Non-JSON line — log it raw so we can debug runner crashes.
log.warning("gemini-runner stdout (non-JSON): %s", line[:200])
continue
self._handle_msg(msg)
def _stderr_reader(self) -> None:
proc = self._proc
if proc is None or proc.stderr is None:
return
for line in proc.stderr:
line = line.rstrip()
if line:
log.warning("gemini-runner stderr: %s", line[:200])
def _handle_msg(self, msg: dict) -> None:
t = msg.get("type")
if t == "user":
text = (msg.get("text") or "").strip()
if not text:
return
log.info("USER: %s", text)
if self._on_transcript is not None:
try:
self._on_transcript(text)
except Exception as e:
log.error("on_transcript failed: %s", e)
if self._on_command is not None:
try:
self._on_command(text, "en")
except Exception as e:
log.error("on_command failed: %s", e)
elif t == "bot":
txt = (msg.get("text") or "").strip()
if txt:
log.info("GEMINI: %s", txt[:120])
elif t == "turn_end":
log.info("listening")
elif t == "ready":
log.info("connected — listening for speech")
elif t == "reconnect":
log.info("server signalled reconnect: %s", msg.get("reason", ""))
elif t == "log":
level = msg.get("level", "info")
text = msg.get("msg", "")
if level == "error":
log.error("[runner] %s", text)
elif level == "warn":
log.warning("[runner] %s", text)
else:
log.info("[runner] %s", text)
else:
log.debug("gemini-runner unknown type=%r: %s", t, msg)

File diff suppressed because it is too large

Voice/turn_recorder.py Normal file

@ -0,0 +1,158 @@
"""Per-turn WAV recorder for voice brains.
Direct port of Project/Sanad/voice/sanad_voice.py::TurnRecorder. Saves each
conversation turn as two WAV files:
<timestamp>_user.wav mono int16 @ 16 kHz (what the mic captured)
<timestamp>_robot.wav mono int16 @ 24 kHz (what the brain spoke)
Plus an index.json that appends one entry per turn with the transcripts.
A turn starts when audio first flows through `capture_user` or
`capture_robot`, and ends on `finish_turn`. Call pattern matches Sanad
exactly: `capture_user`, `capture_robot`, `add_user_text`, `add_robot_text`,
`finish_turn`.
Disable via config: stt.gemini_record_enabled = false (the caller passes
`enabled=False`).
"""
from __future__ import annotations
import json
import logging
import os
import threading
import time
import wave
from datetime import datetime
log = logging.getLogger("turn_recorder")
class TurnRecorder:
"""Saves each turn as two WAV files: user mic + model output."""
def __init__(
self,
enabled: bool = True,
out_dir: str = "",
user_rate: int = 16000,
robot_rate: int = 24000,
):
self.enabled = bool(enabled) and bool(out_dir)
self.out_dir = out_dir
self.user_rate = int(user_rate)
self.robot_rate = int(robot_rate)
if self.enabled:
os.makedirs(self.out_dir, exist_ok=True)
self._lock = threading.Lock()
self._user_buf = []
self._robot_buf = []
self._user_text = ""
self._robot_text = ""
self._started_at = 0.0
def capture_user(self, pcm: bytes) -> None:
if not self.enabled or not pcm:
return
with self._lock:
if not self._user_buf and not self._robot_buf:
self._started_at = time.time()
self._user_buf.append(pcm)
def capture_robot(self, pcm: bytes) -> None:
if not self.enabled or not pcm:
return
with self._lock:
if not self._user_buf and not self._robot_buf:
self._started_at = time.time()
self._robot_buf.append(pcm)
def add_user_text(self, text: str) -> None:
if text and self.enabled:
with self._lock:
self._user_text = (self._user_text + " " + text).strip()
def add_robot_text(self, text: str) -> None:
if text and self.enabled:
with self._lock:
self._robot_text = (self._robot_text + " " + text).strip()
def finish_turn(self) -> dict:
if not self.enabled:
return {}
with self._lock:
user_data = b"".join(self._user_buf)
robot_data = b"".join(self._robot_buf)
user_text = self._user_text
robot_text = self._robot_text
started_at = self._started_at
self._user_buf.clear()
self._robot_buf.clear()
self._user_text = ""
self._robot_text = ""
if not user_data and not robot_data:
return {}
stamp = datetime.fromtimestamp(started_at).strftime("%Y%m%d_%H%M%S")
entry = {
"timestamp": stamp,
"started_at": started_at,
"user_text": user_text,
"robot_text": robot_text,
}
try:
if user_data:
p = os.path.join(self.out_dir, "{}_user.wav".format(stamp))
self._save_wav(p, user_data, self.user_rate)
entry["user_wav"] = p
entry["user_duration_sec"] = round(
len(user_data) / (self.user_rate * 2), 3,
)
if robot_data:
p = os.path.join(self.out_dir, "{}_robot.wav".format(stamp))
self._save_wav(p, robot_data, self.robot_rate)
entry["robot_wav"] = p
entry["robot_duration_sec"] = round(
len(robot_data) / (self.robot_rate * 2), 3,
)
self._append_index(entry)
log.info(
"recorded turn → %s (user %.1fs, robot %.1fs)",
stamp,
entry.get("user_duration_sec", 0),
entry.get("robot_duration_sec", 0),
)
except Exception as exc:
log.warning("recording save failed: %s", exc)
return entry
@staticmethod
def _save_wav(path: str, pcm: bytes, rate: int) -> None:
with wave.open(path, "wb") as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(rate)
wf.writeframes(pcm)
def _append_index(self, entry: dict) -> None:
idx_path = os.path.join(self.out_dir, "index.json")
try:
if os.path.exists(idx_path):
with open(idx_path, "r", encoding="utf-8") as f:
payload = json.load(f)
if not isinstance(payload, dict):
payload = {"records": []}
else:
payload = {"records": []}
except Exception:
payload = {"records": []}
payload.setdefault("records", []).append(entry)
payload["total_records"] = len(payload["records"])
try:
with open(idx_path, "w", encoding="utf-8") as f:
json.dump(payload, f, indent=2, ensure_ascii=False)
except Exception as exc:
log.warning("index.json write failed: %s", exc)

Voice/wake_detector.py

@ -1,263 +0,0 @@
#!/usr/bin/env python3
"""
Voice/wake_detector.py — custom wake-word detector (no ML, no Vosk, no Whisper).
Energy-envelope state machine. Monitors raw PCM audio and fires a wake
event when it sees a short speech burst (sized to match a single spoken
word like "Sanad") followed by a clear silence.
Why this exists:
Vosk's small English lexicon doesn't contain the word "sanad" and
substitutes arbitrary English words ("us", "of", "senate"). Whisper on
this Jetson's torch-aarch64 produces "!!!!!" garbage. Both are broken
for this specific hardware + wake word. An acoustic detector using
only numpy doesn't care what the word actually is — it detects the
*shape* of a single spoken word in the audio energy envelope.
Algorithm (state machine):
SILENCE (rms > speech_threshold)> SPEAKING
SPEAKING (rms < silence_threshold for N chunks)> ANALYZE
ANALYZE: if 0.2 s < speech_duration < 1.5 s fire WAKE
else reset to SILENCE (too short = cough, too long = sentence)
after fire COOLDOWN for 1.5 s before next detection
What it does NOT do:
- Does not identify which word was spoken (anything in the
duration range triggers)
- Does not transcribe follow-on commands (you type those at the
terminal)
- Does not protect against loud non-speech (clapping, door slam)
Usage:
from Voice.wake_detector import WakeDetector
det = WakeDetector(sample_rate=16000)
while True:
chunk = mic.read_chunk(1024) # bytes of int16 PCM
if det.process(chunk):
print("Wake!")
"""
from __future__ import annotations
import time
from dataclasses import dataclass
from typing import Optional
import numpy as np
@dataclass
class WakeConfig:
sample_rate: int = 16_000
# RMS (int16 units) FLOOR for "this chunk is speech". The effective
# threshold is max(speech_threshold, ambient_baseline * adaptive_mult)
# so this is only a minimum guarantee — the detector adapts upward
# in noisy rooms but never below this floor.
# G1 far-field mic at normal speaking distance has rms ~ 80-400 for
# quiet speech, 400-1500 for clear speech. 80 catches quiet speech;
# raise to 120-150 if fan/typing noise triggers false wakes.
speech_threshold: float = 80.0
# How long a burst of speech must last to count as a "word".
min_word_duration_s: float = 0.20
max_word_duration_s: float = 1.50
# How long of continuous silence we need to consider the word ended.
post_silence_s: float = 0.30
# Minimum gap between two consecutive wake fires. Prevents a single
# spoken word from triggering twice.
cooldown_s: float = 1.50
# RMS window size — we analyze this many ms of audio per step.
chunk_ms: int = 50
# Adaptive: how many *recent silent* chunks to average for the noise
# floor, and the multiplier applied on top. effective_threshold =
# max(speech_threshold, baseline * adaptive_mult).
adaptive_window_n: int = 50 # ~2.5 s at 50 ms chunks
adaptive_mult: float = 3.0
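# Worked example with the defaults: ambient baseline 40 (int16 RMS) gives
# eff = max(80.0, 40 * 3.0) = 120; in a quieter room (baseline 15) the
# floor wins: eff = max(80.0, 15 * 3.0) = 80.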
# Periodic diagnostic log cadence (seconds). 0 disables.
diag_log_sec: float = 3.0
class WakeDetector:
"""Streaming acoustic wake detector — no language model required."""
STATE_SILENCE = "SILENCE"
STATE_SPEAKING = "SPEAKING"
def __init__(self, cfg: Optional[WakeConfig] = None):
self.cfg = cfg or WakeConfig()
self._chunk_samples = int(self.cfg.sample_rate * self.cfg.chunk_ms / 1000)
self._min_speech = int(self.cfg.min_word_duration_s * self.cfg.sample_rate)
self._max_speech = int(self.cfg.max_word_duration_s * self.cfg.sample_rate)
self._post_silence = int(self.cfg.post_silence_s * self.cfg.sample_rate)
self._state = self.STATE_SILENCE
self._speech_start = 0 # sample index where current burst began
self._silence_run = 0 # consecutive silent samples inside SPEAKING
self._sample_cursor = 0 # running sample count since start
self._cooldown_until = 0.0 # wall-clock time after which we can fire again
# A small rolling buffer of leftover samples (when the caller's
# chunks don't align with our internal analysis window).
self._carry = np.zeros(0, dtype=np.int16)
# Audio of the most-recent wake-triggering burst. Saved when the
# detector fires so callers (marcus_voice) can run Whisper on it
# and verify the word was actually "Sanad" rather than a cough.
self._burst_samples: list = [] # accumulated during SPEAKING
self._last_burst_audio: Optional[np.ndarray] = None
# Adaptive noise floor (rolling mean of RMS during SILENCE chunks).
self._baseline_buf = [] # last N silent-window RMS values
self._baseline = 0.0 # current estimate
self._peak_since_diag = 0.0 # max rms since last diag log
self._last_diag = time.time()
# Logger is optional — if the host app set up logging, use it.
try:
import logging
self._log = logging.getLogger("wake_detector")
except Exception:
self._log = None
# ── public API ────────────────────────────────────────────────
def process(self, pcm_bytes: bytes) -> bool:
"""
Feed int16 PCM bytes. Returns True once per spoken "word"
(short speech burst followed by silence).
"""
if not pcm_bytes:
return False
incoming = np.frombuffer(pcm_bytes, dtype=np.int16)
samples = np.concatenate([self._carry, incoming]) if self._carry.size else incoming
fired = False
n = self._chunk_samples
i = 0
while i + n <= samples.size:
window = samples[i:i + n]
if self._step(window):
fired = True
# break — flush the rest on next call so we get one fire per word
i += n
break
i += n
self._sample_cursor += n
# Keep whatever didn't fit in a full window for next call.
self._carry = samples[i:].copy()
return fired
def reset(self) -> None:
"""Drop all state — call when resuming from a long pause."""
self._state = self.STATE_SILENCE
self._silence_run = 0
self._carry = np.zeros(0, dtype=np.int16)
self._burst_samples = []
def get_last_burst(self) -> Optional[np.ndarray]:
"""
Return the int16 PCM samples of the most-recent wake-triggering
burst, or None if no wake has fired yet. Used by marcus_voice to
verify the triggering word was actually 'Sanad' before proceeding.
"""
return self._last_burst_audio
# ── internal ──────────────────────────────────────────────────
def _step(self, window: np.ndarray) -> bool:
rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2)))
# Effective threshold = max(config floor, adaptive baseline * mult)
eff = self.cfg.speech_threshold
if self._baseline > 0:
eff = max(eff, self._baseline * self.cfg.adaptive_mult)
is_speech = rms > eff
# Track peak for diag. Log periodically so you can *see* what the
# detector is hearing — invaluable when "not hearing me" happens.
if rms > self._peak_since_diag:
self._peak_since_diag = rms
now = time.time()
if self.cfg.diag_log_sec > 0 and (now - self._last_diag) >= self.cfg.diag_log_sec:
if self._log is not None:
self._log.info(
"wake: peak=%.0f baseline=%.0f eff_threshold=%.0f state=%s",
self._peak_since_diag, self._baseline, eff, self._state,
)
self._peak_since_diag = 0.0
self._last_diag = now
if now < self._cooldown_until:
return False # silent during cooldown
if self._state == self.STATE_SILENCE:
# Learn the noise floor ONLY in silence — so speech bursts
# don't pull the baseline up and lock us out of wake.
if not is_speech:
self._baseline_buf.append(rms)
if len(self._baseline_buf) > self.cfg.adaptive_window_n:
self._baseline_buf.pop(0)
if self._baseline_buf:
self._baseline = sum(self._baseline_buf) / len(self._baseline_buf)
if is_speech:
self._state = self.STATE_SPEAKING
self._speech_start = self._sample_cursor
self._silence_run = 0
# Begin capturing the burst audio for later Whisper verify.
self._burst_samples = [window.copy()]
return False
# STATE_SPEAKING
# Accumulate every window (speech OR silence inside the burst)
# so we capture the full word + trailing quiet.
self._burst_samples.append(window.copy())
if is_speech:
self._silence_run = 0
# Abort if the burst is longer than a single word — user is
# just talking, not addressing the robot.
if self._sample_cursor - self._speech_start > self._max_speech:
self._state = self.STATE_SILENCE
self._burst_samples = []
return False
# Silent window inside SPEAKING — accumulate.
self._silence_run += window.size
if self._silence_run >= self._post_silence:
speech_len = (self._sample_cursor - self._silence_run) - self._speech_start
self._state = self.STATE_SILENCE
self._silence_run = 0
if self._min_speech <= speech_len <= self._max_speech:
# Snapshot burst audio for the caller's Whisper verify.
self._last_burst_audio = (
np.concatenate(self._burst_samples)
if self._burst_samples else None
)
self._burst_samples = []
self._cooldown_until = now + self.cfg.cooldown_s
return True
return False
# ── standalone test ─────────────────────────────────────────────
if __name__ == "__main__":
import os
import sys
_HERE = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.dirname(_HERE))
from Voice.builtin_mic import BuiltinMic
print("WakeDetector standalone test — say 'Sanad' a few times.")
print("(Ctrl-C to quit)\n")
det = WakeDetector()
mic = BuiltinMic()
mic.start()
try:
while True:
chunk = mic.read_chunk(1024)
if det.process(chunk):
print(f" [WAKE] (t={time.strftime('%H:%M:%S')})")
except KeyboardInterrupt:
pass
finally:
mic.stop()