From 211d4f52ab8601b13e2cb6149f30e7d7e953ee31 Mon Sep 17 00:00:00 2001 From: kassam Date: Mon, 27 Apr 2026 09:39:13 +0400 Subject: [PATCH] Update 2026-04-27 09:39:12 --- Brain/marcus_brain.py | 10 + Config/config_Voice.json | 122 +-- Config/config_gemini_action.json | 250 ++++++ Doc/MARCUS_API.md | 104 ++- Doc/architecture.md | 100 +-- Doc/controlling.md | 48 +- Doc/environment.md | 6 +- Doc/functions.md | 39 +- Doc/pipeline.md | 143 ++-- README.md | 57 +- Voice/audio_io.py | 345 ++++++++ Voice/builtin_mic.py | 191 +---- Voice/gemini_runner.py | 440 ++++++++++ Voice/gemini_script.py | 299 +++++++ Voice/marcus_voice.py | 1342 +++++------------------------- Voice/turn_recorder.py | 158 ++++ Voice/wake_detector.py | 263 ------ 17 files changed, 2062 insertions(+), 1855 deletions(-) create mode 100644 Config/config_gemini_action.json create mode 100644 Voice/audio_io.py create mode 100644 Voice/gemini_runner.py create mode 100644 Voice/gemini_script.py create mode 100644 Voice/turn_recorder.py delete mode 100644 Voice/wake_detector.py diff --git a/Brain/marcus_brain.py b/Brain/marcus_brain.py index 8068549..8f63b65 100644 --- a/Brain/marcus_brain.py +++ b/Brain/marcus_brain.py @@ -214,7 +214,17 @@ def _init_voice(): if isinstance(result, dict): sp = (result.get("speak") or "").strip() if sp and _audio_api: + # Drop Gemini's mic buffer so the robot's own voice + # (picked up by the mic during TtsMaker playback) + # doesn't get transcribed and fed back as a new + # "user" utterance. 
+ if _voice_module is not None: + try: _voice_module.flush_mic() + except Exception: pass _audio_api.speak(sp) + if _voice_module is not None: + try: _voice_module.flush_mic() + except Exception: pass # Redraw the Command: prompt that our print clobbered print("Command: ", end="", flush=True) diff --git a/Config/config_Voice.json b/Config/config_Voice.json index ce58843..de176ba 100644 --- a/Config/config_Voice.json +++ b/Config/config_Voice.json @@ -1,47 +1,49 @@ { "tts": { + "_comment": "G1 TtsMaker — used by API/audio_api.py::speak() for non-Gemini utterances from other Marcus subsystems (e.g. brain fallback announcements). Gemini owns its own voice via gemini_brain; this section does not affect the Gemini path.", "backend": "builtin_ttsmaker", "builtin_speaker_id": 2, "target_sample_rate": 16000 }, + "stt": { - "_backend_comment": "'faster_whisper' (Whisper base.en int8 on CPU) or 'moonshine' (useful-sensors Moonshine via onnxruntime). Moonshine has a different error profile — worth trying when Whisper consistently mishears commands as short hallucinations like 'Yes.', 'Bye.', 'It.'. Moonshine requires `pip install moonshine-voice` on the Jetson.", - "backend": "moonshine", - "moonshine_language": "en", - "_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.", + "_comment": "Voice pipeline: Gemini Live STT (text-mode) → Marcus brain → TtsMaker. Gemini transcribes the user's speech with server-side VAD; Marcus's brain (Brain/marcus_brain.py) decides the reply and speaks it via AudioAPI.speak → TtsMaker. No audio comes back from Gemini (response_modalities=['TEXT']). Install on Jetson: `pip install google-genai`. API key: env MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY fallback).", - "_mode_comment": "Three modes. 
'always_on_gated' (default, Sanad-style) = continuously transcribe+log every utterance to logs/transcript.log. Dispatch only when wake word is in the utterance. 'Sanad ' dispatches the cmd. Bare 'Sanad' → speak 'Yes' and treat the NEXT utterance as the command (within await_command_timeout_sec). 'wake_and_command' = classic acoustic wake first, then record a separate command (no always-listen overhead). 'always_on' = transcribe + dispatch everything, no gate (chatty).",
-    "mode": "wake_and_command",
-    "await_command_timeout_sec": 10.0,
+    "_gemini_comment": "Gemini Live STT-only settings. The actual Gemini WebSocket runs in a SEPARATE Python 3.10+ subprocess (Voice/gemini_runner.py) because google-genai requires Python ≥3.9 and marcus is pinned to Python 3.8 by the NVIDIA Jetson torch wheel. The marcus parent process spawns `gemini_python_path -u Voice/gemini_runner.py` and parses the JSON-line transcripts on stdout. Env overrides: MARCUS_GEMINI_API_KEY / MARCUS_GEMINI_MODEL / MARCUS_GEMINI_PYTHON.",
+    "_gemini_python_path_comment": "Path to a Python 3.10+ binary that has `google-genai` installed (typically a separate conda env, e.g. `gemini_sdk` on this Jetson). Leave empty to auto-detect — the manager tries ~/miniconda3/envs/gemini_sdk/bin/python and a few common alternates. Override at runtime via env MARCUS_GEMINI_PYTHON.",
+    "gemini_python_path": "",
+    "_gemini_api_key_comment": "SECURITY: leave empty in the repo — supply the key via env MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY). Never commit a literal key here.",
+    "gemini_api_key": "",
+    "gemini_model": "gemini-2.5-flash-native-audio-preview-12-2025",
+    "gemini_voice_name": "Charon",
+    "gemini_audio_profile": "builtin",
+    "gemini_chunk_size": 512,
+    "gemini_send_sample_rate": 16000,
+    "gemini_record_enabled": true,
-    "_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. 
idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.", - "always_on_speech_entry_rms": 150.0, - "always_on_silence_exit_rms": 70.0, - "always_on_silence_duration_sec": 0.8, - "always_on_min_utterance_sec": 0.3, - "always_on_max_utterance_sec": 12.0, - "always_on_idle_log_sec": 5.0, - "always_on_ambient_mult": 1.4, - "always_on_ambient_window_chunks": 100, + "_gemini_system_prompt_comment": "Marcus brain is the authoritative reply path; Gemini is just an ear here. Keep the prompt short — it tells Gemini to transcribe, not to chat. Override by pointing gemini_system_prompt_file at a text file (relative paths resolve from PROJECT_ROOT).", + "gemini_system_prompt_file": "", + "gemini_system_prompt": "You are Sanad's ear. Your only job is to transcribe what the user says to Sanad, the humanoid robot. Do not respond conversationally. Do not speculate. Do not invent dialogue. If the user addresses Sanad, return exactly what they said. Stay completely silent in your response.", + "_gemini_vad_comment": "Gemini server-side VAD tuning. start_sensitivity/end_sensitivity accept 'START_SENSITIVITY_HIGH|LOW' and 'END_SENSITIVITY_HIGH|LOW'. HIGH start = eagerly treats any speech-like sound as turn start, LOW = more conservative. LOW end = longer patience before ending a turn, HIGH = cuts turn sooner. prefix_padding_ms preserves audio from just before speech is detected. silence_duration_ms is how long of quiet ends a turn.", + "gemini_vad_start_sensitivity": "START_SENSITIVITY_HIGH", + "gemini_vad_end_sensitivity": "END_SENSITIVITY_LOW", + "gemini_vad_prefix_padding_ms": 20, + "gemini_vad_silence_duration_ms": 200, - "whisper_model": "base.en", - "whisper_device": "cpu", - "whisper_compute_type": "int8", + "_gemini_session_comment": "Reconnect / error-handling knobs. session_timeout_sec matches Gemini Live's max session (~11 min). 
After max_consecutive_errors failures the client is recreated; no_messages_timeout_sec catches dead sessions that stop emitting.", + "gemini_session_timeout_sec": 660, + "gemini_max_reconnect_delay_sec": 30, + "gemini_max_consecutive_errors": 10, + "gemini_no_messages_timeout_sec": 30, - "_whisper_tuning_comment": "base.en is the only model that decodes fast enough on Jetson Orin NX CPU. TESTED: small.en takes 10-12s per 1s burst (unusable); base.en runs ~2-3s per burst. tiny.en is even faster (~1s) but noticeably worse accuracy. If accuracy is poor on base.en (garbled transcriptions), the fix is hardware — switch to a close-talking USB mic (Hollyland) via mic.backend:pactl_parec. small.en cached in ~/.cache/huggingface/hub/ if you want to experiment again — try it on an x86 dev machine to see the accuracy gain before blaming Jetson.", - "mic_gain": 1.0, - "whisper_beam_size": 8, - "whisper_no_speech_threshold": 0.85, - "whisper_log_prob_threshold": -1.8, - "whisper_compression_ratio_threshold": 3.0, - "whisper_temperature_fallback": [0.0, 0.2, 0.4], - "_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.", - "_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.", - "_initial_prompt_comment": "EMPTY BY DEFAULT. 
Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). Clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to a single short cue if you want nudging and can tolerate occasional echoes.", - "whisper_initial_prompt": "Sanad. Turn left. Turn right. Move forward. Move back. Stop. Sit down. Stand up. Wave hello. Follow me. Come here. Go home. Look around. What do you see.", + "mic_gain": 1.0, - "_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.", + "_dispatch_comment": "Motion command dispatch side-channel. Marcus listens to Gemini's input_transcription; if the text contains a wake-word variant AND the remainder fuzzy-matches a canonical phrase in command_vocab at >= command_vocab_cutoff, Marcus fires on_command() in parallel to Gemini's verbal reply. Dedup on the canonical form within command_cooldown_sec prevents streaming partials from double-firing.", + "command_vocab_cutoff": 0.72, + "command_cooldown_sec": 1.5, + "min_transcription_length": 3, + + "_vocab_comment": "wake_words = variants Gemini may produce for 'Sanad' — word-boundary matched in the user transcript. command_vocab = canonical command phrases. The dispatcher fuzzy-matches the transcript (after wake-word strip) against command_vocab. garbage_patterns lists short noise phrases Gemini sometimes emits — rejected before fuzzy-match unless they happen to equal a vocab entry exactly. 
Edit these to add new vocabulary — NO code change required.", "wake_words": [ "sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad", "sanat", "sunnat", "sonnat", "sinnat", "sennat", @@ -52,7 +54,6 @@ "thanad", "zanad", "sa nad", "san ad", "san odd", "san add" ], - "_wake_words_exclude_comment": "DELIBERATELY EXCLUDED from wake_words: 'said', 'sent', 'sand', 'sandy', 'sunday', 'signed', 'synod', 'sonata', 'sonnet', 'senate', 'sane', 'saint', 'sana'. These collide with common English and would false-trigger the gate.", "command_vocab": [ "what do you see", "what can you see", "look around", "come to me", "come here", "come back", "come closer", @@ -73,8 +74,6 @@ "remember this", "forget", "do it again", "repeat", "undo", "follow me", "stay here" ], - "command_vocab_cutoff": 0.72, - "_garbage_comment": "Whisper's known 'no phonetic content' outputs on low-SNR audio. YouTube outros, filler words, single-letter hallucinations. Any transcription matching one of these (or shorter than min_transcription_length) is rejected before fuzzy-match — treated as silence.", "garbage_patterns": [ "thanks for watching", "thank you for watching", "thank you", "thanks", @@ -83,72 +82,41 @@ "okay", "ok", "um", "uh", "hmm", "mm", "i", "a" - ], - "min_transcription_length": 3, - - - "_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. 
Measured-here room ambient ≈ 250-350, so 400 gives margin.", - "speech_threshold": 200.0, - "min_word_duration": 0.25, - "max_word_duration": 2.50, - "post_silence": 0.20, - "wake_cooldown": 1.00, - "wake_chunk_ms": 50, - "wake_adaptive_window_n": 50, - "wake_adaptive_mult": 2.0, - "wake_diag_log_sec": 3.0, - - "wake_ack": "tts", - "_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).", - - "_wake_verify_comment": "ENABLED — confirm each acoustic wake with a lightweight Whisper decode on the triggering burst BEFORE speaking 'Yes' / opening the record window. Accept if the transcription contains any wake-word variant OR starts with s/sh/z (Whisper's consistent signature for mishearing 'Sanad' as 'Stop', 'Sand', 'Set', etc.). Reject pure noise (empty whisper) and clearly non-/sa-/ speech silently. Cost: ~1-2s extra per wake for the Whisper decode. Trade-off: slower wake response, but no 'Yes' played on stray loud noises. Set false if you prefer speed over false-wake filtering.", - "wake_verify_enabled": true, - - - "_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.", - "speech_entry_rms": 400.0, - "silence_exit_rms": 200.0, - "_vad_tuning_comment": "silence_duration_sec = how long of quiet ends an utterance. 0.6 cuts short commands fast (good UX) but may clip a thinking pause. ambient_mult = effective_entry multiplier over measured ambient (cmd is entry = max(speech_entry_rms, ambient * mult * 1.8)). Smaller = more eager, catches quieter speech. 
1.5 matches the wake-mult behavior.", - "silence_duration_sec": 0.6, - "max_record_sec": 5.0, - "min_record_sec": 0.4, - "ambient_probe_sec": 0.2, - "ambient_mult": 1.5, - "ambient_cap_rms": 200.0, - "_recording_comment": "Debug recording — save every command turn's audio to Data/Voice/Recordings/ as WAV. Filename includes epoch timestamp + transcription slug so you can replay what Whisper got and compare to what it heard. Rotates to keep most recent N files. Filename prefixes: 'cmd_*' = successful transcription, 'unk_*' = empty/rejected.", - "recording_enabled": true, - "recording_keep_count": 50, - "command_cooldown_sec": 1.5, - "post_tts_settle_sec": 0.4, - "_post_tts_settle_comment": "Time the mic ignores input AFTER the robot finishes speaking. Too short → TTS echo becomes false utterance. Too long → user speaks during the dead window and first syllables are clipped. 0.4s matches the G1 speaker decay at mic_gain=1.0; raise if you bump mic_gain above 1.5, lower if users report 'it cut off my first word'." + ] }, + "mic": { + "_comment": "Used by API/audio_api.py::record() for non-Gemini capture (e.g. ad-hoc recording commands from other subsystems). Gemini reads the mic via Voice/audio_io.py BuiltinMic directly.", "backend": "builtin_udp", "source_index": "3", "format": "s16le", "rate": 16000, "channels": 1 }, + "mic_udp": { + "_comment": "G1 on-board mic multicast parameters. Consumed by Voice/audio_io.py BuiltinMic.", "group": "239.168.123.161", "port": 5555, "buffer_max_bytes": 64000, "read_timeout_sec": 0.04 }, + "speaker": { + "_comment": "G1 on-board speaker parameters. 
dds_interface is the robot's DDS NIC; app_name is the stream label used by AudioClient.PlayStream.", "dds_interface": "eth0", "volume": 100, - "app_name": "sanad" + "app_name": "sanad", + "begin_stream_pause_sec": 0.15, + "wait_finish_margin_sec": 0.3 }, + "audio": { "data_dir": "Data/Voice/Recordings", "log_file": "logs/voice.log" }, + "messages": { - "wake_heard": "Yes", - "no_speech": "I didn't catch that, please say it again", - "error_tts": "Speech synthesis failed", - "error_mic": "Microphone error", "ready": "Voice system ready" } } diff --git a/Config/config_gemini_action.json b/Config/config_gemini_action.json new file mode 100644 index 0000000..6e197ea --- /dev/null +++ b/Config/config_gemini_action.json @@ -0,0 +1,250 @@ +{ + "_description": "Gemini action dispatch — maps spoken phrases to canonical motion commands. Mirrors Sanad's scripts/sanad_arm.txt pattern (Project/Sanad/scripts/sanad_arm.txt) but in JSON with action groups instead of a Python-set file. When stt.backend='gemini', Voice/marcus_voice.py::_dispatch_gemini_command matches the user's transcript (after stripping 'Sanad') against 'phrases' under each action and fires on_command with the action's 'canonical' string. Edit this file to add new spoken variants WITHOUT touching code.", + "_format": "actions..phrases — array of spoken variants (lowercase, punctuation stripped). Match is whole-word, case-insensitive. One phrase hit = fire.\nactions..canonical — the string passed to self._on_command(text, 'en'). Must be a recognised command in Brain/command_parser.py.\nactions..description — human-only; dispatcher ignores it.\nNon-motion conversation ('how are you', 'who are you', 'what do you see') is NOT listed here — Gemini answers those naturally via voice. Only physical actions live in this file.", + + "settings": { + "_comment": "Dispatcher behaviour. 
require_wake_word=true means the transcript must contain 'Sanad' (or a fuzzy variant from stt.wake_words) before any phrase is considered — matches the current Marcus persona rule. fire_on_wake_match=true fires the action instantly on transcript; false defers until Gemini's turn_complete (robot speaks the acknowledgement first, then moves) — mirrors Sanad's fire_on_wake_match flag in voice/text_utils.maybe_trigger_arm.", + "trigger_enabled": true, + "require_wake_word": true, + "fire_on_wake_match": true, + "stream_buffer_sec": 2.0, + "dedup_window_sec": 2.0, + "repeat_suppress_sec": 0.25, + "pending_action_ttl_sec": 6.0 + }, + + "actions": { + "turn_left": { + "canonical": "turn left", + "description": "Rotate in place 90° to the left.", + "phrases": [ + "turn left", + "rotate left", + "spin left", + "go left", + "face left" + ] + }, + "turn_right": { + "canonical": "turn right", + "description": "Rotate in place 90° to the right.", + "phrases": [ + "turn right", + "rotate right", + "spin right", + "go right", + "face right" + ] + }, + "turn_around": { + "canonical": "turn around", + "description": "Rotate 180°.", + "phrases": [ + "turn around", + "turn back", + "spin around", + "about face", + "face the other way" + ] + }, + + "move_forward": { + "canonical": "move forward", + "description": "Walk forward one step interval.", + "phrases": [ + "move forward", + "go forward", + "walk forward", + "step forward", + "forward", + "keep going", + "walk ahead" + ] + }, + "move_back": { + "canonical": "move backward", + "description": "Walk backward one step interval.", + "phrases": [ + "move back", + "move backward", + "go back", + "go backward", + "walk back", + "walk backward", + "step back", + "backward", + "reverse" + ] + }, + "step_left": { + "canonical": "move left", + "description": "Sidestep left.", + "phrases": [ + "step left", + "move left", + "slide left", + "strafe left" + ] + }, + "step_right": { + "canonical": "move right", + "description": "Sidestep 
right.", + "phrases": [ + "step right", + "move right", + "slide right", + "strafe right" + ] + }, + + "stop": { + "canonical": "stop", + "description": "Halt current motion immediately.", + "phrases": [ + "stop", + "halt", + "wait", + "pause", + "freeze", + "hold", + "stop moving", + "stand still", + "don't move" + ] + }, + + "sit_down": { + "canonical": "sit down", + "description": "Sit down to the ground from standing.", + "phrases": [ + "sit down", + "sit", + "take a seat", + "have a seat" + ] + }, + "stand_up": { + "canonical": "stand up", + "description": "Stand up from sitting.", + "phrases": [ + "stand up", + "stand", + "get up", + "rise" + ] + }, + + "wave_hello": { + "canonical": "wave hello", + "description": "Wave with the right arm.", + "phrases": [ + "wave hello", + "wave", + "say hi", + "greet", + "wave to me", + "wave at me" + ] + }, + "raise_arm": { + "canonical": "raise arm", + "description": "Raise the right arm straight up.", + "phrases": [ + "raise arm", + "raise your arm", + "lift your arm", + "arm up", + "hand up" + ] + }, + "lower_arm": { + "canonical": "lower arm", + "description": "Return the arm to the resting position.", + "phrases": [ + "lower arm", + "lower your arm", + "drop your arm", + "arm down", + "hand down", + "rest your arm" + ] + }, + "point": { + "canonical": "point", + "description": "Point with the right arm (used after 'look at ...').", + "phrases": [ + "point", + "point at it", + "point to it", + "point there" + ] + }, + + "come_here": { + "canonical": "come here", + "description": "Approach the speaker.", + "phrases": [ + "come here", + "come to me", + "come closer", + "approach", + "get closer", + "come over here" + ] + }, + "follow_me": { + "canonical": "follow me", + "description": "Follow the speaker until told to stop.", + "phrases": [ + "follow me", + "come with me", + "walk with me" + ] + }, + "stay_here": { + "canonical": "stay here", + "description": "Stop following and hold position.", + "phrases": [ + "stay 
here", + "stay", + "wait here", + "hold position", + "don't follow me" + ] + }, + "go_home": { + "canonical": "go home", + "description": "Return to the home position.", + "phrases": [ + "go home", + "return home", + "head home", + "go back home" + ] + }, + + "patrol": { + "canonical": "patrol", + "description": "Start the patrol routine.", + "phrases": [ + "patrol", + "start patrol", + "begin patrol", + "patrol the area", + "walk the route" + ] + }, + + "look_around": { + "canonical": "look around", + "description": "Scan the environment (vision sweep).", + "phrases": [ + "look around", + "scan the room", + "scan around", + "survey the area", + "have a look around" + ] + } + } +} diff --git a/Doc/MARCUS_API.md b/Doc/MARCUS_API.md index 9f400db..b0eb4a8 100644 --- a/Doc/MARCUS_API.md +++ b/Doc/MARCUS_API.md @@ -29,7 +29,7 @@ | Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. | | Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. | | Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. | -| Voice stack finalised | `Voice/marcus_voice.py`, `Voice/wake_detector.py` | Custom energy wake detector (pure numpy) + Whisper verify + faster-whisper command STT + fuzzy-match to canonical commands. Vosk experiment reverted; Gemini Live reverted. Single local STT engine. 
| +| Voice stack — Gemini Live STT + TtsMaker hybrid | `Voice/audio_io.py`, `Voice/gemini_script.py`, `Voice/turn_recorder.py`, `Voice/marcus_voice.py` | Sanad-pattern port: `AudioIO.from_profile("builtin", audio_client=ac)` builds the G1 mic + speaker; `GeminiBrain` runs Gemini Live `response_modalities=["TEXT"]` in a worker thread; `_dispatch_gemini_command` gates each transcript on the wake word "Sanad" + fuzzy match against `command_vocab` then forwards to the brain. The brain's reply is spoken by `AudioAPI.speak()` via on-robot TtsMaker — Gemini never speaks. Earlier iterations (faster-whisper / wake_detector / Vosk / Moonshine / full S2S) all removed. Cloud dep: env `MARCUS_GEMINI_API_KEY`. | | Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. | | Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. | | Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. | @@ -766,27 +766,31 @@ SAFETY: --- -## 15. Voice API (mic + TTS + wake + STT) +## 15. Voice API (mic + Gemini Live STT + TtsMaker) -Current pipeline: G1 mic → custom energy wake detector → Whisper verify → TtsMaker "Yes" → record → faster-whisper transcribe → fuzzy-match canonical command → brain. Replaces all prior experiments (Gemini Live WebSocket, Vosk grammar, edge-tts / Piper). +Current pipeline: G1 mic → Gemini Live (`response_modalities=["TEXT"]`) → input_transcription → wake-word gate + fuzzy match → brain → on-robot TtsMaker reply. Sanad-pattern port; only cloud dependency is the Gemini API key. 
Replaces all prior local-STT attempts (Whisper / Moonshine / Vosk / wake_detector). The full Sanad-style speech-to-speech mode (Gemini speaks back) was tested and removed — TtsMaker as the single voice owner avoids the audio-collision class. -### Mic — `Voice.builtin_mic.BuiltinMic` +### Mic + Speaker bundle — `Voice.audio_io.AudioIO` -Captures the G1's on-board array microphone over UDP multicast. No USB mic required. 16 kHz mono int16 PCM natively; no resampling needed. +Sanad-pattern factory. `BuiltinMic` joins the G1's UDP multicast audio (16 kHz mono int16). `BuiltinSpeaker` wraps `AudioClient.PlayStream` with 24→16 kHz resampling (built but idle in STT-only mode; TtsMaker owns the speaker via a separate firmware API). ```python -from Voice.builtin_mic import BuiltinMic -mic = BuiltinMic(group="239.168.123.161", port=5555, buf_max=64_000) -mic.start() +from Voice.audio_io import AudioIO + +audio = AudioIO.from_profile("builtin", audio_client=ac) +audio.start() try: - pcm = mic.read_chunk(1024) # 512 samples, ~32 ms, int16 mono - # or - pcm = mic.read_seconds(3.0) + pcm = audio.mic.read_chunk(1024) # 512 samples, ~32 ms + audio.mic.flush() finally: - mic.stop() + audio.stop() ``` -Config under `config_Voice.json::mic_udp`. +Config under `config_Voice.json::{mic_udp, speaker}`. + +### Mic shim — `Voice.builtin_mic.BuiltinMic` + +Backward-compat shim. Subclasses `audio_io.BuiltinMic` and adds `read_seconds(s)` for `AudioAPI.record()`. Old imports of `from Voice.builtin_mic import BuiltinMic` keep working. ### TTS — `Voice.builtin_tts.BuiltinTTS` @@ -795,41 +799,62 @@ Wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker`. 
E ```python from Voice.builtin_tts import BuiltinTTS tts = BuiltinTTS(audio_client, default_speaker_id=0) -tts.speak("Hello, I am Sanad", block=True) # synth + play on G1 body speaker +tts.speak("Hello, I am Sanad", block=True) ``` Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly. -### Wake detection — `Voice.wake_detector.WakeDetector` +### Gemini Live STT — `Voice.gemini_script.GeminiBrain` -Pure-numpy energy state machine with adaptive noise floor. Classifies any 0.35-1.5 s speech burst as a candidate wake, captures the audio for post-hoc verification. +Direct port of Sanad's `gemini/script.py`, configured with `response_modalities=["TEXT"]` so Gemini transcribes but never speaks. Reconnect-safe: 660 s session timeout, exponential backoff cap 30 s, client recreated after 10 consecutive errors. Runs an asyncio loop inside a worker thread; sync `start()/stop()` wrappers. ```python -from Voice.wake_detector import WakeDetector, WakeConfig -cfg = WakeConfig( - sample_rate=16_000, - speech_threshold=400.0, # min RMS floor — above noise - min_word_duration_s=0.35, # filter out coughs (<0.35s) - max_word_duration_s=1.50, # filter out sentences - post_silence_s=0.30, # how long silence marks word end - cooldown_s=1.50, # min gap between fires - chunk_ms=50, # RMS analysis window - adaptive_window_n=50, # rolling mean of idle RMS - adaptive_mult=3.0, # effective = max(floor, baseline×mult) +from Voice.audio_io import AudioIO +from Voice.turn_recorder import TurnRecorder +from Voice.gemini_script import GeminiBrain + +audio = AudioIO.from_profile("builtin", audio_client=ac) +audio.start() +rec = TurnRecorder(enabled=True, out_dir="Data/Voice/Recordings/gemini_turns") + +def on_transcript(text): + print("USER:", text) + +def on_command(text, lang): + print("dispatch:", text) + +brain = GeminiBrain( + audio, rec, voice_name="Charon", + system_prompt="...transcriber-role prompt...", + 
api_key=os.environ["MARCUS_GEMINI_API_KEY"], + on_transcript=on_transcript, + on_command=on_command, ) -det = WakeDetector(cfg) -while True: - pcm = mic.read_chunk(1024) - if det.process(pcm): - burst = det.get_last_burst() # audio that triggered wake - break +brain.start() +# ... later ... +brain.stop() +audio.stop() ``` -Config under `config_Voice.json::stt.{speech_threshold, min_word_duration, …}`. +Config under `config_Voice.json::stt.gemini_*` — model, voice, VAD sensitivity, session lifecycle, persona, recording. + +### Per-turn recorder — `Voice.turn_recorder.TurnRecorder` + +Saves `_user.wav` per turn plus an `index.json` entry with both transcripts. In STT-only mode, no `_robot.wav` is written (Gemini emits text, not audio). + +```python +from Voice.turn_recorder import TurnRecorder +rec = TurnRecorder(enabled=True, out_dir="Data/Voice/Recordings/gemini_turns", + user_rate=16000, robot_rate=24000) +rec.capture_user(pcm_bytes) +rec.add_user_text("Sanad, turn right") +rec.add_robot_text("Turning right") # Gemini's text reply (recorded for review, not spoken) +rec.finish_turn() # → 20260425_120000_user.wav + index.json append +``` ### Voice orchestrator — `Voice.marcus_voice.VoiceModule` -Drives the full pipeline: wake detector → Whisper verify → record → transcribe → fuzzy-match → dispatch. Three operating modes (`wake_and_command`, `always_on`, `always_on_gated`) selectable via `stt.mode`. +Drives the full pipeline: builds AudioIO + TurnRecorder + GeminiBrain, gates each transcript on the wake word "Sanad", strips it, fuzzy-matches against `command_vocab`, dedups partial transcripts within `command_cooldown_sec`, then forwards the cleaned text to the user-supplied `on_command` callback. 
```python
from API.audio_api import AudioAPI
@@ -837,17 +862,20 @@ from Voice.marcus_voice import VoiceModule
 def on_command(text, lang):
     print(f"heard: {text}")
+    # return or call audio_api.speak(reply); flush_mic() is automatic in marcus_brain
 audio = AudioAPI()
 voice = VoiceModule(audio, on_command=on_command)
-voice.start()  # background thread
+voice.start()
 # ... later ...
 voice.stop()
 ```
-Vocabulary (`wake_words`, `command_vocab`, `garbage_patterns`) is loaded from `config_Voice.json::stt.*` at `VoiceModule.__init__`. All thresholds, Whisper params, and mode selection live in the same config — no Python edits required to tune. See `Doc/controlling.md` → "Voice" for the tuning-knobs cheat sheet.
+Vocabulary (`wake_words`, `command_vocab`, `garbage_patterns`) is loaded from `config_Voice.json::stt.*` at `VoiceModule.__init__`. All Gemini tunables (model, VAD, session timeouts, persona) live in the same config — no Python edits required. See `Doc/controlling.md` → "Voice" for the tuning-knobs cheat sheet.
 
-The brain's `_init_voice()` wires `on_command` to `process_command(text)` → `audio_api.speak(reply)`.
+`flush_mic()` is a public hook that `Brain/marcus_brain._on_command` calls before AND after `audio_api.speak(reply)` so TtsMaker output isn't transcribed back into Gemini as a fake user utterance.
+
+The brain's `_init_voice()` wires `on_command` to `process_command(text)` → `flush_mic()` → `audio_api.speak(reply)` → `flush_mic()`.
 
 ### AudioAPI — `API.audio_api.AudioAPI`
 
diff --git a/Doc/architecture.md b/Doc/architecture.md
index 24b830e..c556526 100644
--- a/Doc/architecture.md
+++ b/Doc/architecture.md
@@ -13,9 +13,9 @@
 - **Ollama compute-graph caps** — `num_batch=128`, `num_ctx=2048` in `config_Brain.json` (otherwise llama.cpp OOMs on the 16 GB Jetson).
 - **`num_predict_main: 120`** (was 200) — saves ~400-600 ms per open-ended command. 
- **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import. -- **G1 built-in microphone** via UDP multicast `239.168.123.161:5555` — `Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic. +- **G1 built-in microphone** via UDP multicast `239.168.123.161:5555` — defined in `Voice/audio_io.py::BuiltinMic` (Sanad-pattern port). `Voice/builtin_mic.py` is a thin backward-compat shim used by `API/audio_api.record()`. - **G1 built-in TTS** via `client.TtsMaker()` — `Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed. -- **Voice stack finalised** — custom energy wake detector (`Voice/wake_detector.py`) + faster-whisper command STT (`Voice/marcus_voice.py`). Whisper verifies each acoustic wake before acking. Gemini voice module and Vosk grammar STT both tried and removed. +- **Voice stack — Gemini Live STT + TtsMaker hybrid (subprocess split)** — `google-genai` requires Python ≥3.9 but the marcus env is pinned to Python 3.8 by the NVIDIA Jetson torch wheel, so the actual Gemini WebSocket runs in a **separate Python 3.10+ subprocess** (`Voice/gemini_runner.py`, executed under the `gemini_sdk` conda env). The marcus parent (Python 3.8) spawns it via `Voice/gemini_script.py::GeminiBrain` and parses JSON-line transcripts on stdout. `Voice/marcus_voice.py::_dispatch_gemini_command` gates each transcript on the wake word "Sanad" + fuzzy match against `stt.command_vocab`, then forwards to `Brain.marcus_brain.process_command(...)`. The brain's reply is spoken by the on-robot `TtsMaker` — Gemini never speaks. Same pattern Sanad uses (it parses log lines from a Gemini subprocess too). Earlier in-process attempts (faster-whisper / Vosk / Moonshine / Gemini Live in marcus 3.8 / full Gemini speech-to-speech) were all tried and removed. - **Subsystem flags** — `config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages. 
- **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps. - **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo clones cleanly on macOS/Windows. @@ -66,7 +66,7 @@ Marcus/ │ ├── config_Memory.json # session/places paths │ ├── config_Network.json # Jetson IPs (eth0/wlan0), ports │ ├── config_ImageSearch.json # search defaults -│ ├── config_Voice.json # mic, TTS, wake detector thresholds, Whisper params, wake_words/command_vocab/garbage_patterns vocab lists, VAD thresholds +│ ├── config_Voice.json # mic, TTS, Gemini Live STT params (model, VAD sensitivities, session timeouts), wake_words/command_vocab/garbage_patterns vocab lists used by the dispatch gate │ ├── config_LiDAR.json # Livox Mid-360 connection + SLAM engine params │ └── marcus_prompts.yaml # All Qwen-VL prompts (main, goal, patrol, talk, verify, 2× imgsearch) │ # Total: 12 JSON files + 1 YAML. (config_Memory.json removed 2026-04-21.) 
@@ -83,11 +83,14 @@ Marcus/ │ ├── audio_api.py # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic │ └── lidar_api.py # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status() │ -├── Voice/ # Mic + TTS + wake detector + faster-whisper STT -│ ├── builtin_mic.py # G1 array mic via UDP multicast 239.168.123.161:5555 -│ ├── builtin_tts.py # BuiltinTTS — client.TtsMaker(text, speaker_id) -│ ├── wake_detector.py # Pure-numpy energy wake detector (WakeDetector, WakeConfig) with adaptive baseline -│ └── marcus_voice.py # VoiceModule — orchestrates wake → verify → record → Whisper → dispatch +├── Voice/ # Audio I/O + Gemini Live STT (subprocess) + TtsMaker glue +│ ├── audio_io.py # Mic/Speaker ABCs + BuiltinMic (UDP multicast) + BuiltinSpeaker (PlayStream) + AudioIO.from_profile (Sanad pattern) +│ ├── builtin_mic.py # Backward-compat shim — subclasses audio_io.BuiltinMic + adds read_seconds() for AudioAPI.record() +│ ├── builtin_tts.py # BuiltinTTS — client.TtsMaker(text, speaker_id) (used by AudioAPI.speak) +│ ├── gemini_runner.py # Subprocess script (Python 3.10+, gemini_sdk env) — opens Gemini Live, owns mic + WAV recorder, emits JSON-line transcripts on stdout +│ ├── gemini_script.py # GeminiBrain — subprocess MANAGER (Python 3.8). Spawns gemini_runner.py, reads stdout, fires on_transcript / on_command. Provides flush_mic() over stdin. 
+│ ├── turn_recorder.py # TurnRecorder — used by the runner to save _user.wav + index.json +│ └── marcus_voice.py # VoiceModule — spawns GeminiBrain, runs the wake-word dispatch gate │ ├── Brain/ # Decision logic — imports ONLY from API/ │ ├── marcus_brain.py # Orchestrator: init_brain(), process_command(), run_terminal() @@ -186,13 +189,14 @@ Marcus/ │ wraps │ wraps ┌──────────────▼───────────┐ ┌────────▼────────────────┐ │ Navigation / Vision │ │ Voice │ -│ goal_nav.py │ │ builtin_mic.py │ -│ patrol.py │ │ builtin_tts.py │ -│ marcus_odometry.py │ │ marcus_voice.py │ -│ marcus_yolo.py │ │ wake_detector.py │ -│ │ │ (Whisper + TtsMaker) │ -│ marcus_imgsearch.py │ └──────────┬──────────────┘ -└──────────────┬───────────┘ │ +│ goal_nav.py │ │ audio_io.py │ +│ patrol.py │ │ gemini_script.py │ +│ marcus_odometry.py │ │ turn_recorder.py │ +│ marcus_yolo.py │ │ marcus_voice.py │ +│ │ │ builtin_tts.py │ +│ marcus_imgsearch.py │ │ (Gemini STT + TtsMaker)│ +└──────────────┬───────────┘ └──────────┬──────────────┘ + │ │ │ │ ┌──────────────▼─────────────────────────▼────────────┐ │ Core Layer │ @@ -489,53 +493,55 @@ Supports text-only search (no reference image) using hint description. ### Voice/ -Mic, TTS, energy-based wake detector, and faster-whisper STT pipeline. All files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable). +Audio I/O + Gemini Live STT + TtsMaker glue. All files run only when `config_Brain.json::subsystems.voice == true`. The voice path is the **single cloud dependency** in Marcus — Gemini Live transcribes the user's mic; everything else (TTS, brain, vision, motion) stays on the Jetson. TTS is English-only by design (the G1 firmware silently maps non-English to Chinese). 
-#### `builtin_mic.py` (~180 lines) -Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer. Identical pattern to `Project/Sanad/voice/audio_io.py::BuiltinMic`. +The Voice/ layout mirrors `Project/Sanad/voice/` (Mic/Speaker/AudioIO factory + TurnRecorder + GeminiBrain) — class names and method signatures match Sanad verbatim. Only the brain configuration differs: Marcus uses `response_modalities=["TEXT"]` (STT-only) while Sanad uses `["AUDIO"]` (full speech-to-speech). -**Exports:** -- `BuiltinMic(group, port, buf_max, read_timeout)` — init (idempotent) -- `start()` / `stop()` — socket lifecycle -- `read_chunk(n)` — pull exactly `n` bytes (blocks up to `read_timeout`, pads silence otherwise) -- `read_seconds(s)` — convenience for "record `s` seconds" -- `flush()` — drop buffered audio (called while TTS plays, to avoid echo) +#### `audio_io.py` (~345 lines) +Sanad-pattern hardware abstraction. Defines `Mic` and `Speaker` ABCs, the G1-specific `BuiltinMic` (UDP multicast subscriber, `239.168.123.161:5555`, 32 ms chunks, thread-safe ring buffer), `BuiltinSpeaker` (streaming wrapper around `AudioClient.PlayStream` with 24→16 kHz resample), and the `AudioIO.from_profile("builtin", audio_client=ac)` factory. `BuiltinSpeaker` is built in STT-only mode but never driven — TtsMaker owns the speaker via a separate G1 firmware API. -#### `builtin_tts.py` (~70 lines) -Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input. +**Exports:** `Mic`, `Speaker`, `BuiltinMic`, `BuiltinSpeaker`, `AudioIO`, `_resample_int16`, `_as_int16_array`. -**Exports:** -- `BuiltinTTS(audio_client, default_speaker_id=0)` — init -- `speak(text, speaker_id=None, block=True)` — synth+play on G1 body speaker +#### `builtin_mic.py` (~58 lines) +Backward-compat shim. 
Subclasses `audio_io.BuiltinMic` and adds `read_seconds(s)` for `API/audio_api.record()`. Old imports of `from Voice.builtin_mic import BuiltinMic` keep working. New code should import `audio_io.BuiltinMic` directly. -#### `wake_detector.py` (~240 lines) -Pure-numpy energy-envelope state machine. Fires a wake event when it sees a short speech burst (0.2-1.5 s) sized to match a single spoken word like "Sanad", followed by a clear silence. No ML, no lexicon — just amplitude classification. -Adaptive noise-floor baseline: learns ambient RMS during idle, raises the effective threshold proportionally, so the detector works the same in a quiet room and a noisy lab. Captures the triggering burst audio (`get_last_burst()`) so callers can verify it was actually "Sanad" before acking. Exists because Vosk/Whisper both failed on the G1 far-field mic for short non-English proper nouns. +#### `builtin_tts.py` (~120 lines) +Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Used by `API/audio_api.speak()` to render the brain's spoken replies. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input. -**Exports:** -- `WakeDetector(cfg)` with `WakeConfig(sample_rate, speech_threshold, min_word_duration_s, max_word_duration_s, post_silence_s, cooldown_s, chunk_ms, adaptive_window_n, adaptive_mult, diag_log_sec)` -- `process(pcm_bytes) -> bool` — feed audio, returns True once per spoken "word" -- `reset()`, `get_last_burst() -> np.ndarray | None` +**Exports:** `BuiltinTTS(audio_client, default_speaker_id=0)`, `.speak(text, speaker_id=None, block=True)`. -#### `marcus_voice.py` (~1000 lines) -Voice orchestrator. 
Reads from `BuiltinMic`, runs the `WakeDetector`, verifies the wake burst with a lightweight Whisper decode, records the command with hysteretic VAD (speech_entry / silence_exit thresholds, adaptive to measured ambient), trims leading silence before Whisper, transcribes with faster-whisper, fuzzy-matches against `command_vocab` to canonicalize near-misses ("Turn right up" → "turn right"), then dispatches to the brain callback. +#### `gemini_script.py` (~458 lines) +The STT brain. `GeminiBrain` opens a Gemini Live session over WebSocket (`google-genai` SDK) configured with `response_modalities=["TEXT"]` and `input_audio_transcription`. A `_send_mic_loop` coroutine streams 512-sample int16 PCM blobs at 16 kHz; a `_receive_loop` coroutine extracts `server_content.input_transcription.text` and fires `on_transcript` + `on_command` callbacks. No audio comes back — Gemini's text reply is logged but never played. -Three operating modes selectable via `stt.mode`: -- `wake_and_command` (default): classic acoustic wake → TTS "Yes" → record → Whisper → brain -- `always_on`: no wake, transcribe every utterance, dispatch all -- `always_on_gated`: transcribe everything, only dispatch utterances containing "Sanad" +Reconnect-safe: 660 s session timeout, exponential backoff (cap 30 s), client recreated after 10 consecutive errors, 30 s no-message dead-session detector. All values match Sanad's `voice_config.json::sanad_voice`. -Wake verify rule: Whisper's decode must either contain a wake-word variant (`stt.wake_words`) OR start with `s/sh/z` — Whisper's consistent signature for mishearing "Sanad" as "Stop"/"Set"/"Sand". Pure silence / non-s speech is rejected silently. +`start()/stop()` are synchronous wrappers that run `async run()` inside a worker thread's asyncio loop — Marcus's `VoiceModule` is threaded, so this adapter is the only Marcus-specific addition vs Sanad's structure. 
-**Module-level** (populated at `VoiceModule.__init__` from config): -- `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` — loaded from `config_Voice.json::stt.*`, single source of truth -- `_has_wake_word(text)`, `_strip_wake_word(text)` — iterative until stable, handles "Sanad. Sanad." → "" +**Exports:** `GeminiBrain(audio_io, recorder, voice_name, system_prompt, *, api_key, on_transcript, on_command)` + `start()/stop()`. + +#### `turn_recorder.py` (~158 lines) +Per-turn WAV saver. `capture_user(pcm)` and `add_user_text(text)` buffer in RAM until `finish_turn()` flushes one `_user.wav` (16 kHz int16 mono) plus an `index.json` entry per turn with `user_text` + `robot_text` (Gemini's text reply, kept for review even though never spoken). In STT-only mode, `_robot.wav` is **not** written — there is no PCM coming back from Gemini to capture; the actual robot voice is generated on demand by TtsMaker and never flows through this recorder. + +**Exports:** `TurnRecorder(enabled, out_dir, user_rate, robot_rate)` + `capture_user`, `capture_robot`, `add_user_text`, `add_robot_text`, `finish_turn`. + +#### `marcus_voice.py` (~450 lines) +Voice orchestrator. `VoiceModule.__init__` loads `WAKE_WORDS / COMMAND_VOCAB / GARBAGE_PATTERNS` from `config_Voice.json::stt.*`. `_voice_loop_gemini` builds `AudioIO.from_profile("builtin", audio_client=ac)`, instantiates `TurnRecorder`, then constructs and starts a `GeminiBrain` with two callbacks: + +- `on_transcript(text)` → writes a `HEARD ...` line to `logs/transcript.log`. +- `on_command(text, "en")` → `_dispatch_gemini_command`: gates on `_has_wake_word(text)` (must contain "Sanad" or a fuzzy variant), strips the wake word, fuzzy-matches against `command_vocab` for canonicalization (e.g. "Turn right up" → "turn right"), dedups partial transcripts within `command_cooldown_sec`, then forwards the cleaned text to `Brain.marcus_brain.process_command(...)` via the user's `on_command` callback. 
+ +`flush_mic()` drops any buffered mic audio — called by `Brain/marcus_brain._on_command` before AND after `_audio_api.speak(reply)` so TtsMaker output isn't transcribed back into Gemini as a fake user utterance. + +**Module-level** (populated at `__init__` from config): +- `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` — single source of truth +- `_has_wake_word(text)`, `_strip_wake_word(text)` — iterative; handles "Sanad. Sanad." → "" - `_closest_command(text, cutoff)` — difflib fuzzy-match against `COMMAND_VOCAB` **Exports:** - `VoiceModule(audio_api, on_command=cb, on_wake=None)` — init - `start()` / `stop()` — background thread lifecycle -- `is_running` property +- `flush_mic()` — public hook for echo prevention around speak() +- `is_speaking` property — delegates to `AudioAPI.is_speaking` --- diff --git a/Doc/controlling.md b/Doc/controlling.md index a9492b4..8f9bd79 100644 --- a/Doc/controlling.md +++ b/Doc/controlling.md @@ -79,29 +79,30 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765` ## Voice -- **Wake word:** "Sanad" (Whisper mishears it as "Stop", "Sand", "Set", "Send" — all accepted via the /s-/ phonetic rule; see `config_Voice.json::stt.wake_words` for the 33 fuzzy variants). -- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed. -- **Wake detection:** custom energy-envelope state machine (pure numpy, no ML) — fires on any 0.35-1.5 s speech burst followed by silence. Adaptive to room ambient. -- **Wake verify:** lightweight Whisper decode on the triggering burst. Accepts if it contains a wake-word variant OR starts with `s`/`sh`/`z` (Whisper's consistent signature for "Sanad"). Rejects pure noise / non-s speech silently. -- **STT (command):** faster-whisper `base.en` int8 on CPU — loads ~1.5 s on first wake, cached after. -- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only. 
-- **Barge-in:** the mic is muted during TTS playback, then flushed on return to listening. +- **Wake word:** "Sanad" — gated at dispatch time on Gemini's transcript. Common mishearings ("Sannad", "Senad", "Sa nad", etc.) all accepted via the 33-entry `config_Voice.json::stt.wake_words` fuzzy list. Word-boundary match, not substring (so "standard" doesn't trigger off "sand"). +- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic, no acoustic wake detector. +- **STT:** Gemini Live (`gemini-2.5-flash-native-audio-preview-12-2025`) with `response_modalities=["TEXT"]` — Gemini does the transcription. The mic is streamed in 32 ms chunks; Gemini's server-side VAD decides turn boundaries. **The Gemini WebSocket runs in a separate Python 3.10+ subprocess** (`Voice/gemini_runner.py`) because `google-genai` doesn't support Python 3.8 (which marcus is pinned to). Marcus spawns the runner via the `gemini_sdk` conda env and reads JSON-line transcripts off its stdout. Requires `pip install google-genai` **inside the gemini_sdk env** (not the marcus env) and an API key in `MARCUS_GEMINI_API_KEY` (or `SANAD_GEMINI_API_KEY` fallback). Set `MARCUS_GEMINI_PYTHON` (or `stt.gemini_python_path`) if the gemini_sdk env lives somewhere besides `~/miniconda3/envs/gemini_sdk/`. +- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only. Gemini does NOT speak — only Marcus's brain reply is spoken, via TtsMaker. +- **Echo prevention:** `VoiceModule.flush_mic()` is called by Marcus's brain before AND after `audio_api.speak()` so TtsMaker output isn't transcribed back into Gemini as a fake user utterance. -Interaction flow: say "Sanad" → hear *"Yes"* → speak your command → see transcript on console → Marcus answers through the speaker. 
+Interaction flow: speak "Sanad" + your request → Gemini transcribes (Marcus prints `USER: ...`) → wake-word gate passes → brain handles it (motion, VLM Q&A, place memory, …) → reply spoken through G1 speaker. -Three voice modes selectable via `config_Voice.json::stt.mode`: -- `wake_and_command` (default) — wake word required before each command -- `always_on` — continuously transcribe + dispatch every utterance -- `always_on_gated` — always listen + log, dispatch only if utterance contains "Sanad" +Examples: +- "Sanad, turn right" → robot turns right, brain says "Done" +- "Sanad, what do you see" → Qwen2.5-VL describes the camera frame, brain speaks the description +- "Sanad" alone (no payload) → no dispatch (the persona prompt tells Gemini to acknowledge silently) +- "what do you see" (no "Sanad") → wake-word gate blocks, no dispatch, no reply (avoids false motion from background chatter) -To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only ~2 s faster. +To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only without opening the Gemini WebSocket. 
-**Tuning knobs** (when false wakes or rejected real wakes) — all in `config_Voice.json::stt`: -- Too many false wakes from coughs/claps → raise `speech_threshold` or `min_word_duration` -- Real "Sanad" being rejected → check the log line `wake REJECTED — %r` to see what Whisper heard; widen `wake_words` if needed -- Commands transcribed wrong → check `whisper: lp=%.2f nsp=%.2f text=%r` log line; lower `whisper_no_speech_threshold` or tighten `whisper_log_prob_threshold` -- "I didn't catch that" on silence → raise `min_transcription_length` -- Latency too high → set `wake_ack: "none"` (skip "Yes" TTS, save ~1.7 s/cycle) +**Tuning knobs** — all in `config_Voice.json::stt`: +- Real "Sanad" misheard by Gemini and not matching wake_words → check `logs/transcript.log` for the `HEARD` line, add the variant to `wake_words` +- Commands transcribed wrong → field accuracy is mostly Gemini's job; for room-specific tuning try `gemini_vad_silence_duration_ms` (longer = more patience for hesitations) +- VAD too eager / too slow → `gemini_vad_start_sensitivity` (`HIGH` / `LOW`) and `gemini_vad_end_sensitivity` (`LOW` for slow speech, `HIGH` to cut early) +- Filler words triggering dispatch → expand `garbage_patterns` +- Robot too talkative / too terse → edit `gemini_system_prompt` (or point `gemini_system_prompt_file` at a `.txt` for richer personas) +- Session reconnects too aggressive → raise `gemini_max_consecutive_errors` +- Disable per-turn WAV saves → `gemini_record_enabled: false` --- @@ -209,7 +210,7 @@ Control what initializes at boot. Defaults: ``` Set any to `false` to skip that subsystem's init. 
Boot time drops roughly: -- `voice: false` → ~2 s faster (no Whisper model load) +- `voice: false` → ~1 s faster (no Gemini WebSocket open, no mic thread) - `lidar: false` → ~1 s faster (no SLAM subprocess spawn) - `imgsearch: false` → already the default; re-enable only when you need `search/ …` - `autonomous: false` → minor, but removes the AutonomousMode init @@ -244,7 +245,10 @@ Most values configurable in `Config/config_Network.json` and `config_Voice.json: | `llama runner process has terminated: %!w()` | Ollama compute graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` | | Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only | | `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10–15 s on first Qwen load; subsequent commands are fast | -| Wake word never fires | Energy burst below floor, or Whisper verify rejecting | Check `logs/voice.log` — if you see `wake REJECTED — 'X'`, add X's root variant to `config_Voice.json::stt.wake_words`. If `baseline=0` persists, your ambient exceeds the floor — raise `speech_threshold`. | +| Wake word never fires | Gemini transcribed but `_has_wake_word` rejected | Check `logs/transcript.log` — if `HEARD ...` shows what Gemini heard but no `CMD ...` follows, the transcript has a misheard "Sanad" variant; add the root form to `config_Voice.json::stt.wake_words`. | +| Voice silent on boot | Missing Gemini API key | Check `logs/voice.log` for `No Gemini API key found`. Set `export MARCUS_GEMINI_API_KEY='...'` before launching `run_marcus.py`. | +| `google-genai not installed` in runner stderr | Package missing in gemini_sdk env | Activate the gemini_sdk conda env and `pip install google-genai` THERE (not in marcus). 
| +| `no Python 3.10+ env found for the Gemini runner` | gemini_sdk env in non-default path | Set `export MARCUS_GEMINI_PYTHON=/path/to/gemini_sdk/bin/python` or edit `stt.gemini_python_path`. | | Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" | | `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If persists, `ping 192.168.123.120` | | Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up | @@ -257,7 +261,7 @@ Most values configurable in `Config/config_Network.json` and `config_Voice.json: |------|------| | Brain code | `~/Marcus/Brain/` | | Server | `~/Marcus/Server/marcus_server.py` | -| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,wake_detector,marcus_voice}.py` | +| Voice | `~/Marcus/Voice/{audio_io,builtin_mic,builtin_tts,gemini_script,turn_recorder,marcus_voice}.py` | | Config | `~/Marcus/Config/` | | Prompts | `~/Marcus/Config/marcus_prompts.yaml` | | YOLO model | `~/Marcus/Models/yolov8m.pt` | diff --git a/Doc/environment.md b/Doc/environment.md index 94f3817..a6fef92 100644 --- a/Doc/environment.md +++ b/Doc/environment.md @@ -142,7 +142,8 @@ All 25 project modules import cleanly from the `marcus` env at `/home/unitree/Ma ``` OK Core.config_loader Core.env_loader OK Core.log_backend Core.logger -OK Voice.builtin_mic Voice.builtin_tts Voice.marcus_voice +OK Voice.audio_io Voice.builtin_mic Voice.builtin_tts +OK Voice.gemini_script Voice.turn_recorder Voice.marcus_voice OK Vision.marcus_yolo Vision.marcus_imgsearch OK API.llava_api API.yolo_api API.camera_api OK API.zmq_api API.imgsearch_api API.odometry_api @@ -384,5 +385,6 @@ Config file (`Config/config_Vision.json`): | 2026-04-21 | **Subprocess leak fix**: `AudioAPI._record_parec` now wraps `Popen` in try/finally with `terminate → wait(1.0) → kill` fallback; orphan `parec` 
processes can no longer survive Ctrl-C. Last-resort `proc.kill()` catches only `OSError` (not bare `except`). | | 2026-04-21 | **Modelfile corrected**: `Models/Modelfile` now `FROM qwen2.5vl:3b` (was `:7b`) with a header explaining it's an optional build template — runtime uses `ollama pull qwen2.5vl:3b` directly. | | 2026-04-21 | **Final verification**: 14-dimension smoke test green — no Arabic, no dead dirs, 0 orphan keys, every FileHandler rotates, no bare `except: pass`, no stale `Models_marcus` / `marcus_llava` refs, 25/25 modules import. | -| 2026-04-24 | **Voice finalised on faster-whisper + custom energy wake**. Added `Voice/wake_detector.py` (pure-numpy energy state machine, adaptive noise floor, burst-audio capture for verify). Rewrote `Voice/marcus_voice.py` around it: three operating modes (`wake_and_command` / `always_on` / `always_on_gated`), hysteretic record VAD, pre-speech silence trim (300 ms pre-roll preserved), faster-whisper `base.en` int8 CPU decode, fuzzy-match canonicalisation against `command_vocab`, `GARBAGE_PATTERNS` + length filter for noise hallucinations, `/s-/` phonetic wake verify (accepts Whisper mishearings of "Sanad" like "Stop"/"Set"/"Sand"). Tried and reverted: Gemini Live WebSocket (Python 3.8 incompatibility + latency), Vosk grammar STT (English lexicon can't decode "Sanad"; big model cold-load too slow on Jetson). All voice tunables (33 wake_words, 68 command_vocab, 17 garbage_patterns, ~25 threshold/VAD/Whisper keys) live in `config_Voice.json::stt.*` — zero hardcoded strings in Voice/. | +| 2026-04-24 | **Voice finalised on faster-whisper + custom energy wake** (later replaced — see 2026-04-25). Added `Voice/wake_detector.py` + rewrote `Voice/marcus_voice.py` around it with three modes (`wake_and_command` / `always_on` / `always_on_gated`), hysteretic record VAD, faster-whisper `base.en` int8 CPU decode. Field testing on the G1 far-field mic showed unacceptable transcription error rates regardless of tuning. 
| | 2026-04-24 | **Command parser widened**: `Brain/command_parser.py` now has `_RE_SIMPLE_DIR` (`left`, `go back`, `move forward`, `step right`, etc.) and `_RE_STOP_SIMPLE` (`stop`, `halt`, `wait`, `pause`, `freeze`) regex fast-paths — these bare-direction / bare-stop commands now skip Qwen entirely (~50 ms vs ~5 s). Motion velocities and step duration pulled from `config_Navigation.json::{move_map, step_duration_sec}` via `API/zmq_api.py`; command_parser no longer contains hardcoded `0.3` / `2.0` magic numbers. | +| 2026-04-25 | **Voice rewritten on Gemini Live (Sanad-pattern port)**. Replaced the wake_detector + faster-whisper + (briefly attempted) Moonshine paths with Gemini Live STT-only (`response_modalities=["TEXT"]`). New files mirror Sanad's `voice/audio_io.py` + `voice/sanad_voice.py::TurnRecorder` + `gemini/script.py` structure: **`Voice/audio_io.py`** (Mic/Speaker ABCs + BuiltinMic/BuiltinSpeaker + AudioIO.from_profile factory), **`Voice/turn_recorder.py`** (per-turn WAV saver), **`Voice/gemini_script.py`** (`GeminiBrain` STT-only, threaded asyncio adapter around Sanad's async `run()`). `Voice/builtin_mic.py` becomes a backward-compat shim. `Voice/wake_detector.py` deleted. `Voice/marcus_voice.py` shrinks 1578 L → 438 L: `_voice_loop_gemini` builds AudioIO, spawns GeminiBrain, dispatch gate (`_dispatch_gemini_command`) requires "Sanad" + fuzzy-match `command_vocab` then forwards to brain. Added `flush_mic()` hook called by `Brain/marcus_brain._on_command` around `audio_api.speak()` to prevent TtsMaker echo from being transcribed. `Config/config_Voice.json` rewritten: dropped `stt.backend`, `stt.mode`, all `whisper_*` / `moonshine_*` / wake-detector / VAD-record knobs, all barge-in keys (no Gemini audio-out anymore); kept Sanad-matching values for `mic_udp`, `speaker`, Gemini VAD/session/voice settings. Single cloud dependency: env `MARCUS_GEMINI_API_KEY` (or `SANAD_GEMINI_API_KEY` fallback) + `pip install google-genai` on Jetson. 
Mechanism / tunables / interruption identical to Sanad; only difference is response modality (TEXT vs AUDIO) and command-handling gate (wake-word-required vs Sanad's `trigger_enabled` master flag). | diff --git a/Doc/functions.md b/Doc/functions.md index 1aaea92..776df94 100644 --- a/Doc/functions.md +++ b/Doc/functions.md @@ -49,26 +49,37 @@ Script only. Prepends `PROJECT_ROOT` to `sys.path`, then calls `Brain.marcus_bra --- -## `Voice/` — mic + TTS + wake + STT +## `Voice/` — audio I/O + Gemini Live STT + TtsMaker | File | Public API | |---|---| -| `builtin_mic.py` | `_find_g1_local_ip()` + **class `BuiltinMic`** | -| `builtin_tts.py` | **class `BuiltinTTS`** | -| `wake_detector.py` | **dataclass `WakeConfig`** + **class `WakeDetector`** | +| `audio_io.py` | `_find_g1_local_ip()`, `_resample_int16`, `_as_int16_array`, abstract **classes `Mic`, `Speaker`**, concrete **classes `BuiltinMic`, `BuiltinSpeaker`**, **dataclass `AudioIO`** with `from_profile()` factory | +| `builtin_mic.py` | **class `BuiltinMic`** (subclass of `audio_io.BuiltinMic` + `read_seconds()` for `AudioAPI.record()`) | +| `builtin_tts.py` | **class `BuiltinTTS`** (used by `AudioAPI.speak()`) | +| `gemini_script.py` | module-level `_load_voice_cfg()`, `_audio_energy()`, **class `GeminiBrain`** | +| `turn_recorder.py` | **class `TurnRecorder`** | | `marcus_voice.py` | module-level `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` (populated from config), helpers `_has_wake_word`, `_strip_wake_word`, `_strip_wake_word_once`, `_closest_command`, **class `VoiceModule`** | -**`Voice.builtin_mic.BuiltinMic`** — G1 UDP multicast mic: -`__init__(group, port, buf_max, read_timeout)`, `start()`, `stop()`, `read_chunk(num_bytes)`, `read_seconds(seconds)`, `flush()`; internal `_recv_loop`. +**`Voice.audio_io.BuiltinMic`** — G1 UDP multicast mic (Sanad-pattern port): +`__init__(group, port, buf_max)`, `start()`, `stop()`, `read_chunk(num_bytes)`, `flush()`; internal `_recv_loop`. 
+ +**`Voice.audio_io.BuiltinSpeaker`** — streaming wrapper over `AudioClient.PlayStream` (built but idle in STT-only mode; TtsMaker owns the speaker): +`__init__(audio_client, app_name=None)`, `begin_stream()`, `send_chunk(pcm, source_rate)`, `wait_finish()`, `stop()`, properties `interrupted`, `total_sent_sec`. Internal `_stop_play_api()`. + +**`Voice.audio_io.AudioIO`** — paired mic + speaker bundle: +`@classmethod from_profile(profile_id, *, audio_client=None) -> AudioIO`, `start()`, `stop()`. Only `"builtin"` profile supported (Anker/Hollyland USB profiles dropped). **`Voice.builtin_tts.BuiltinTTS`** — wraps `AudioClient.TtsMaker`: `__init__(audio_client, default_speaker_id=0)`, `speak(text, speaker_id=None, block=True)`. -**`Voice.wake_detector.WakeDetector`** — pure-numpy energy wake: -`__init__(cfg: WakeConfig)`, `process(pcm_bytes) -> bool`, `reset()`, `get_last_burst() -> np.ndarray | None`. Internal: `_step(window)` state-machine per 50 ms analysis window; adaptive `_baseline_buf` rolling mean of idle-silence RMS; captures triggering burst audio for post-hoc Whisper verify. +**`Voice.gemini_script.GeminiBrain`** — Gemini Live STT-only brain (Sanad `gemini/script.py` port): +`__init__(audio_io, recorder, voice_name=None, system_prompt="", *, api_key, on_transcript=None, on_command=None)`, `start()`, `stop()`, `async run()`. Internal: `_thread_main()` runs an asyncio loop in a worker thread, `_build_config(types)` returns `LiveConnectConfig(response_modalities=["TEXT"], input_audio_transcription, system_instruction)`, `_send_mic_loop(session, types)` streams 32 ms PCM chunks, `_receive_loop(session)` extracts `input_transcription.text` → callbacks + `model_turn` text → log + recorder. -**`Voice.marcus_voice.VoiceModule`** — voice orchestrator. 
Drives the wake detector, verifies each fire with a lightweight Whisper decode (wake-word substring OR /s-/ phonetic match), records commands with a hysteretic VAD, trims pre-speech silence, transcribes via faster-whisper, fuzzy-normalises near-misses to canonical commands, dispatches to brain. -`__init__(audio_api, on_command=None, on_wake=None)`, `start()`, `stop()`, `is_running` property. Internal: `_get_fw()` lazy faster-whisper loader, `_read_mic_raw` / `_read_mic_gained`, `_record_command()` with adaptive VAD + pre-silence trim, `_transcribe(audio)` Whisper decode + garbage filter, `_transcribe_command(audio)` thin wrapper, `_normalize_command(text)` fuzzy-match to `COMMAND_VOCAB`, `_handle_wake()` / `_voice_loop()` / `_voice_loop_wake()` / `_voice_loop_always_on(gated)`, `_save_unk_wav(audio)` for post-mortem debugging. +**`Voice.turn_recorder.TurnRecorder`** — per-turn WAV saver: +`__init__(enabled, out_dir, user_rate, robot_rate)`, `capture_user(pcm_bytes)`, `capture_robot(pcm_bytes)`, `add_user_text(text)`, `add_robot_text(text)`, `finish_turn() -> dict`. Internal: `_save_wav`, `_append_index`. In STT-only mode `_robot.wav` is never written (Gemini emits text, not audio). + +**`Voice.marcus_voice.VoiceModule`** — voice orchestrator. Builds `AudioIO.from_profile("builtin", audio_client=ac)`, spawns `GeminiBrain` with `_on_gemini_transcript` (transcript log) and `_dispatch_gemini_command` (wake-word gate + fuzzy match → on_command callback) hooks. Forwards every "Sanad + X" transcript to Marcus's brain via the user-supplied `on_command` callback. +`__init__(audio_api, on_command=None, on_wake=None)`, `start()`, `stop()`, `flush_mic()`, `is_speaking` property. Internal: `_voice_loop` (calls `_voice_loop_gemini`), `_voice_loop_gemini` (assembles AudioIO + TurnRecorder + GeminiBrain), `_on_gemini_transcript(text)`, `_dispatch_gemini_command(text, lang)`, `_normalize_command(text)`. 
The `flush_mic()` hook is called by `Brain/marcus_brain._on_command` before AND after `audio_api.speak()` to prevent TtsMaker output from being transcribed back as user input. --- @@ -156,9 +167,11 @@ from API.lidar_api import init_lidar, obstacle_ahead, get_slam_pose, stop_lida from API.memory_api import init_memory, log_cmd, log_detection, place_save, place_goto # voice pipeline -from Voice.marcus_voice import VoiceModule -from Voice.builtin_mic import BuiltinMic -from Voice.builtin_tts import BuiltinTTS +from Voice.marcus_voice import VoiceModule +from Voice.audio_io import AudioIO, BuiltinMic, BuiltinSpeaker +from Voice.builtin_tts import BuiltinTTS # used by AudioAPI.speak() +from Voice.gemini_script import GeminiBrain +from Voice.turn_recorder import TurnRecorder # navigation from Navigation.goal_nav import navigate_to_goal diff --git a/Doc/pipeline.md b/Doc/pipeline.md index a7d50a8..f0450d1 100644 --- a/Doc/pipeline.md +++ b/Doc/pipeline.md @@ -44,70 +44,83 @@ Subsystem flags live in `config_Brain.json::subsystems`. Current defaults: ## Voice pipeline (when `subsystems.voice = true`) +Marcus uses **Gemini Live STT-only** for the user's mic plus **G1 TtsMaker** for the brain's spoken reply. No local wake detector — Gemini's server-side VAD decides turn boundaries; the wake-word check happens at dispatch time on the transcribed text. 
+ ``` G1 body mic (array) └─ UDP multicast 239.168.123.161:5555 ── int16 mono 16 kHz PCM ▼ -Voice/builtin_mic.py::BuiltinMic - ring buffer (64 KB) + read_chunk(n) +Voice/audio_io.py::BuiltinMic + ring buffer (64 KB) + read_chunk(n) (Sanad-pattern; see audio_io.py) ▼ -Voice/wake_detector.py::WakeDetector - pure-numpy energy state machine (SILENCE ⇄ SPEAKING) - adaptive noise floor: eff_threshold = max(speech_threshold, baseline × 3) - fires on 0.35-1.5 s bursts followed by 0.3 s silence → captures burst audio +Voice/gemini_script.py::GeminiBrain (asyncio worker thread) + ├─ client.aio.live.connect(model="gemini-2.5-flash-native-audio-preview-12-2025", + │ config=LiveConnectConfig( + │ response_modalities=["TEXT"], ← STT-only + │ input_audio_transcription={}, + │ realtime_input_config=AutomaticActivityDetection( + │ start_of_speech_sensitivity=HIGH, + │ end_of_speech_sensitivity=LOW, + │ prefix_padding_ms=20, + │ silence_duration_ms=200), + │ system_instruction=)) + ├─ _send_mic_loop → 512-sample PCM chunks (32 ms each) → session.send_realtime_input + ├─ _receive_loop → server_content.input_transcription.text → on_transcript + on_command + └─ on turn_complete → recorder.finish_turn() → "listening" log ▼ -Voice/marcus_voice.py::VoiceModule._handle_wake() - ├─ 1. Whisper verify on the burst audio: - │ text = faster-whisper(burst) - │ accept if _has_wake_word(text) OR startswith(s/sh/z) - │ reject otherwise (cough, clap, hello, okay) → silent return - ├─ 2. audio_api.speak("Yes") → G1 body speaker (~1.5 s) - ├─ 3. post_tts_settle_sec wait + mic flush - ├─ 4. _record_command() — hysteretic VAD - │ speech_entry_rms / silence_exit_rms (adapt from wake baseline) - │ trim leading silence (keep 300 ms pre-roll) → tight clip for Whisper - ├─ 5. _transcribe(audio) - │ faster-whisper (base.en int8 CPU) - │ beam_size=5, temperature=0, initial_prompt bias toward Sanad vocab - │ GARBAGE_PATTERNS + min_transcription_length reject noise hallucinations - ├─ 6. 
_normalize_command(text) +Voice/marcus_voice.py::VoiceModule._dispatch_gemini_command(text, "en") + ├─ 1. _has_wake_word(text) + │ match any of stt.wake_words variants as a whole word — else return early + ├─ 2. _strip_wake_word(text) + │ iterative until stable, "Sanad. Sanad." → "" / "Sanad turn right" → "turn right" + ├─ 3. garbage / min-length filter + │ skip "okay"/"thanks"/single-letter unless command_vocab matches exactly + ├─ 4. _normalize_command(stripped) │ difflib fuzzy-match vs stt.command_vocab │ "Turn right up" → "turn right" (canonical form) - └─ 7. on_command(text, "en") + ├─ 5. dedup vs last_gemini_canon within command_cooldown_sec + └─ 6. on_command(text, "en") ▼ -Brain/marcus_brain.py::process_command(text) - ├─ regex fast-path → Brain/command_parser.py::try_local_command() - │ places · odometry walk/turn · patrol · session recall · goal_nav - │ + SIMPLE_DIR ("go back", "right", "forward") · STOP_SIMPLE ("stop", "halt") - │ + NAT_GOAL_RE (naturalised goals like "the chair") · auto on/off - │ (~50 ms when matched — NO LLM call) - └─ else → _handle_llava(text) - ├─ get_frame() (10×50 ms poll, no 1 s stall) - ├─ API/llava_api.py::ask(text, img) - │ ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120) - │ → parse_json() → {actions, arm, speak, abort} - └─ Brain/executor.py::execute(d) - ├─ actions → MOVE_MAP[dir] → API/zmq_api.py::send_vel → Holosoma - ├─ arm → API/arm_api.py (stub for now) - └─ abort → gradual_stop() - ▼ -result["speak"] → audio_api.speak(reply) +Brain/marcus_brain.py::_on_command (closure inside init_voice) + ├─ flush_mic() ← drop pending mic audio + ├─ result = process_command(text) + │ ├─ regex fast-path → Brain/command_parser.py::try_local_command() + │ │ places · odometry walk/turn · patrol · session recall · goal_nav + │ │ + SIMPLE_DIR ("go back", "right", "forward") · STOP_SIMPLE ("stop", "halt") + │ │ + NAT_GOAL_RE (naturalised goals like "the chair") · auto on/off + │ │ (~50 ms when matched — NO LLM call) 
+ │ ├─ _TALK_PATTERNS ("what / who / where / …") → _handle_talk(cmd) + │ │ → API/llava_api.py::ask_talk(...) → Qwen2.5-VL + │ └─ else → _handle_llava(text) + │ ├─ get_frame() (10×50 ms poll, no 1 s stall) + │ ├─ API/llava_api.py::ask(text, img) + │ │ ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120) + │ │ → parse_json() → {actions, arm, speak, abort} + │ └─ Brain/executor.py::execute(d) + │ ├─ actions → MOVE_MAP[dir] → API/zmq_api.py::send_vel → Holosoma + │ ├─ arm → API/arm_api.py (stub for now) + │ └─ abort → gradual_stop() + ├─ audio_api.speak(result["speak"]) ← TtsMaker via G1 firmware + └─ flush_mic() ← drop the speaker's echo from mic buffer ▼ API/audio_api.py::speak(text, lang="en") - ├─ mute mic (flush BuiltinMic buffer) ├─ Voice/builtin_tts.py::BuiltinTTS.speak(text) │ client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only │ time.sleep(len(text) * 0.08) - └─ unmute mic → back to listening + └─ → back to listening ``` **Config knobs** (all in `config_Voice.json::stt`): -- Wake: `speech_threshold` (floor), `min_word_duration`, `max_word_duration`, `post_silence`, `wake_cooldown`, `wake_adaptive_mult`, `wake_diag_log_sec` -- Verify: `wake_verify_enabled` -- Record: `speech_entry_rms`, `silence_exit_rms`, `silence_duration_sec`, `max_record_sec`, `min_record_sec`, `ambient_mult`, `ambient_cap_rms` -- Whisper: `whisper_model`, `whisper_compute_type`, `whisper_beam_size`, `whisper_no_speech_threshold`, `whisper_log_prob_threshold`, `whisper_initial_prompt`, `mic_gain` -- Vocab: `wake_words`, `command_vocab`, `garbage_patterns`, `command_vocab_cutoff`, `min_transcription_length` -- Mode: `mode` (`wake_and_command` | `always_on` | `always_on_gated`), `wake_ack` (`tts`|`none`) +- Gemini connection: `gemini_model`, `gemini_voice_name`, `gemini_audio_profile`, `gemini_chunk_size`, `gemini_send_sample_rate` +- Gemini VAD: `gemini_vad_start_sensitivity`, `gemini_vad_end_sensitivity`, `gemini_vad_prefix_padding_ms`, 
`gemini_vad_silence_duration_ms` +- Gemini session lifecycle: `gemini_session_timeout_sec`, `gemini_max_reconnect_delay_sec`, `gemini_max_consecutive_errors`, `gemini_no_messages_timeout_sec` +- Persona: `gemini_system_prompt` (inline) or `gemini_system_prompt_file` (path) +- Recording (debug WAVs): `gemini_record_enabled` +- Mic gain: `mic_gain` +- Dispatch: `wake_words` (gate), `command_vocab` (fuzzy-match target), `garbage_patterns`, `command_vocab_cutoff`, `min_transcription_length`, `command_cooldown_sec` +- Hardware: `mic_udp.{group,port,buffer_max_bytes,read_timeout_sec}`, `speaker.{dds_interface,volume,app_name,begin_stream_pause_sec,wait_finish_margin_sec}` + +**Env overrides** (highest precedence): `MARCUS_GEMINI_API_KEY` (or `SANAD_GEMINI_API_KEY` fallback), `MARCUS_GEMINI_MODEL`, `MARCUS_GEMINI_VOICE`. --- @@ -188,22 +201,24 @@ Brain/command_parser.py — responds to "lidar status" queries | Knob | Location | Effect | |---|---|---| | `subsystems.lidar` | config_Brain.json | SLAM subprocess on/off | -| `subsystems.voice` | config_Brain.json | BuiltinMic + Whisper + TtsMaker loop on/off | +| `subsystems.voice` | config_Brain.json | Gemini Live STT + dispatch + TtsMaker loop on/off | | `subsystems.imgsearch` | config_Brain.json | image-guided search init on/off | | `subsystems.autonomous` | config_Brain.json | auto-patrol state machine init on/off | | `num_batch`, `num_ctx` | config_Brain.json | llama.cpp compute-graph size (128 / 2048 ≈ 1.8 GiB graph — **do not raise** on 16 GB Jetson) | | `num_predict_main` | config_Brain.json | 120 tokens max for the main JSON reply | | `yolo_device`, `yolo_half` | config_Vision.json | `cuda` / FP16 (hard-required; CPU not allowed) | -| `mic.backend` | config_Voice.json | `builtin_udp` (G1 array) or `pactl_parec` (Hollyland fallback) | +| `mic.backend` | config_Voice.json | `builtin_udp` (G1 array — only option used by Gemini path) | | `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast | | 
`mic_udp.read_timeout_sec` | config_Voice.json | `BuiltinMic.read_chunk` budget (default 0.04 s) | -| `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) | -| `stt.wake_words` | config_Voice.json | 33 fuzzy variants of "Sanad" for the wake-verify substring match | +| `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) — used by `AudioAPI.speak()` for the brain's reply | +| `stt.wake_words` | config_Voice.json | 33 fuzzy variants of "Sanad" — wake-word gate at dispatch time | | `stt.command_vocab` | config_Voice.json | 68 canonical command phrases for fuzzy-normalization (`"turn right up"` → `"turn right"`) | -| `stt.garbage_patterns` | config_Voice.json | 17 Whisper noise-hallucinations to reject (`"thanks for watching"`, `"okay"`, etc.) | -| `stt.speech_threshold` etc. | config_Voice.json | energy wake detector thresholds — see `Doc/controlling.md` "Voice" for the full tuning matrix | -| `stt.whisper_*` | config_Voice.json | faster-whisper model, compute type, beam size, confidence gates, bias prompt | -| `stt.mode` | config_Voice.json | `wake_and_command` (default) / `always_on` / `always_on_gated` | +| `stt.garbage_patterns` | config_Voice.json | 17 noise/filler phrases to reject (`"thanks for watching"`, `"okay"`, single letters) | +| `stt.gemini_model` | config_Voice.json | Gemini Live model id (default `gemini-2.5-flash-native-audio-preview-12-2025`); env `MARCUS_GEMINI_MODEL` wins | +| `stt.gemini_api_key` | config_Voice.json | API key fallback (env `MARCUS_GEMINI_API_KEY` or `SANAD_GEMINI_API_KEY` preferred) | +| `stt.gemini_vad_*` | config_Voice.json | server-side VAD start/end sensitivity, prefix padding, silence duration | +| `stt.gemini_session_timeout_sec` | config_Voice.json | reconnect cadence (660 s = Live API session cap) | +| `stt.gemini_record_enabled` | config_Voice.json | save `_user.wav` per turn under `Data/Voice/Recordings/gemini_turns/` | | `timeout_ms`, `stale_threshold_s`, 
`reconnect_delay_s` | config_Camera.json | RealSense frame timeout, reconnect trigger, initial backoff | | `default_max_steps`, `step_delay_s`, `rotate_speed`, `min_steps_warmup` | config_ImageSearch.json | image-guided search rotation cadence (wired into `Vision/marcus_imgsearch.py`) | | `default_walk_speed`, `dist_tolerance`, `angle_tolerance`, `safety_timeout_mult`, `dr_update_hz` | config_Odometry.json | precise motion control (wired into `Navigation/marcus_odometry.py`) | @@ -211,21 +226,19 @@ Brain/command_parser.py — responds to "lidar status" queries --- -## Per-command latency (estimated, post-fixes) +## Per-command latency (estimated) | Step | Typical | Notes | |---|---|---| -| Wake-word detect | <100 ms | pure-numpy energy detector, 50 ms analysis windows | -| Wake verify (first wake) | ~2000 ms | includes faster-whisper `base.en` cold load | -| Wake verify (subsequent) | 300–700 ms | Whisper cached, decodes ~0.5-1.5 s burst | -| "Yes" TTS ack | ~1500 ms | G1 firmware `TtsMaker` minimum | -| Record until silence | 1–5 s | depends on user speech; `max_record_sec=5` cap | -| Pre-silence trim | <1 ms | numpy slice | -| faster-whisper STT | 500–1500 ms | `base.en` int8 on CPU, beam_size=5 | -| Fuzzy-match canonicalisation | <1 ms | difflib against 68 phrases | +| Mic chunk → Gemini Live | ~32 ms | 512-sample PCM blob over WebSocket | +| Gemini server-side VAD turn-end | ~200 ms | configurable via `gemini_vad_silence_duration_ms` (default 200) | +| Gemini transcript emission | 100–500 ms | depends on utterance length; partials may stream | +| Wake-word check + fuzzy-normalize | <5 ms | `re.search` + difflib against 68 phrases | +| Dispatch dedup | <1 ms | string compare + cooldown | | Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall | | Ollama Qwen2.5-VL | 800–1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` | | Executor + ZMQ send | <10 ms | fire-and-forget PUB | | TtsMaker playback | ~len(text) × 80 ms | synthesizes + plays on robot 
| +| `flush_mic` × 2 | <1 ms each | bracketed around `audio_api.speak()` | -**Total wake → answer-playback:** ~**2.5–4 s** for a short vision question like "what do you see" (vs. 5–8 s with the pre-restructure edge-tts/Gemini overhead). +**Total user-stops-talking → answer-playback:** ~**1.5–3 s** for a short vision question like "Sanad, what do you see" — Gemini's instant turn-detection saves the 2 s "Yes" ack the previous Whisper-era pipeline needed. diff --git a/README.md b/README.md index d96d99f..a74761f 100644 --- a/README.md +++ b/README.md @@ -25,15 +25,19 @@ a Python brain. | **Brain** (reason, speak, decide) | Parse commands, reason about vision, pick actions | **Qwen2.5-VL 3B** via Ollama | Jetson GPU | | **Eyes** (see) | Real-time object/person detection | **YOLOv8m** (CUDA, FP16, 320 px, ~22 FPS) | Jetson GPU | | **Eyes** (understand) | Open-ended scene understanding, reading, goal-verify | **Qwen2.5-VL** (same brain model) | Jetson GPU | -| **Ears** (hear) | Energy-based wake detector + command transcription | **Custom DSP wake** (numpy, no ML) + **faster-whisper base.en int8** (STT) | Jetson CPU | -| **Mouth** (speak) | On-robot TTS, no internet needed | **Unitree `TtsMaker`** (G1 firmware) | G1 body speaker | +| **Ears** (hear) | Mic capture + speech-to-text | G1 UDP multicast mic + **Gemini Live STT** (`gemini-2.5-flash-native-audio-preview`, `response_modalities=["TEXT"]`, server-side VAD) | Jetson → Google API | +| **Mouth** (speak) | On-robot TTS for the brain's spoken replies | **Unitree `TtsMaker`** (G1 firmware) | G1 body speaker | | **Legs** (walk) | 29-DoF locomotion + balance | **Holosoma** RL policy (separate process, ONNX) | Jetson CPU | | **Hands** (gesture) | Arm & hand actions | **GR00T N1.5** — pending; `API/arm_api.py` is a stub today | Jetson GPU (future) | | **Inner ear** (map) | SLAM, obstacle detection, localisation | **Livox Mid-360** LiDAR + custom SLAM engine | Jetson (subprocess) | | **Memory** | Places, session history, 
facts | JSON files under `Data/Brain/Sessions/` | Jetson disk | -Nothing here reaches the cloud. The only internet-adjacent bits (edge-tts, -Gemini) were removed — everything runs on the robot's own compute. +Almost everything runs on-robot. The single cloud dependency is **Gemini Live** +for speech-to-text — chosen because the G1's far-field mic + Whisper-on-CPU +combination produced too many transcription errors during real-world testing. +Vision (YOLO + Qwen2.5-VL), reasoning, motion, navigation, memory, LiDAR — all +local on the Jetson. TTS replies still go through G1's on-board `TtsMaker`, +not Gemini. --- @@ -54,7 +58,7 @@ Camera ─┘ ▼ ├─► Legs (Holosoma Three input modalities, same command loop: -- **Voice** — say "**Sanad**" → energy detector fires, Whisper verifies the /sa-/ phoneme signature, robot replies "Yes" → speak your command → faster-whisper transcribes → brain answers through the G1 speaker. +- **Voice** — Gemini Live streams the mic continuously and emits transcripts. When the transcript starts with "**Sanad**" plus a request, Marcus's brain handles it (motion / VLM / Q&A) and replies through the G1 speaker via TtsMaker. No local wake detector, no acoustic ack — Gemini's server-side VAD decides when you've stopped speaking. - **Text** — type the same command into `run_marcus.py`'s terminal. - **WebSocket (remote)** — `Client/marcus_cli.py` or `Client/marcus_client.py` (Tkinter GUI) send commands from a workstation. 
@@ -84,8 +88,8 @@ There are two schools for combining them: | Vision — open-ended scene understanding | same VLM | learned | | Legs / locomotion | **RL policy** (Holosoma, ONNX) | learned | | Arms / gestures | SDK action-ID lookup | **hand-coded** | -| Wake word | Custom energy-envelope DSP (numpy) | hand-coded | -| STT (command) | faster-whisper base.en | learned | +| Wake word gating | String match on `command_vocab` after Gemini transcribes | hand-coded | +| STT (command) | Gemini Live (`gemini-2.5-flash-native-audio-preview`) | cloud-hosted | | TTS | Unitree `TtsMaker` (on-robot DSP) | firmware | | Glue between layers | Python + ZMQ + JSON | hand-coded | @@ -144,7 +148,7 @@ Same hardware, different prompts + wake word. - **Prompts** rewrite: *"You are a museum guide. When a visitor asks about an exhibit, describe it in two sentences and invite them to ask follow-ups."* - **Places** memory pre-loaded with exhibit waypoints; `patrol: exhibit_A → exhibit_B → exit` follows a tour. -- Wake word variants in `config_Voice.json::stt.wake_words` (fuzzy list, handles Whisper mishearings of "Sanad"). +- Wake word variants in `config_Voice.json::stt.wake_words` (fuzzy list, handles common mishearings of "Sanad" Gemini sometimes emits). - Image search (`search/ photo_of_exhibit.jpg`) lets visitors hold up a printed map; the robot navigates to the matching location. - YOLO classes trimmed to people-only if the venue doesn't need object safety. @@ -173,9 +177,10 @@ No code changes required for either deployment. ┌───────┴────────┬──────────────┬────────────┐ ▼ ▼ ▼ ▼ Vision/ Navigation/ Voice/ Lidar/ - YOLO, imgsearch goal_nav, builtin_mic, SLAM engine + YOLO, imgsearch goal_nav, audio_io, SLAM engine patrol, odom builtin_tts, (subprocess) - wake_detector, + gemini_script, + turn_recorder, marcus_voice │ ▼ @@ -202,7 +207,20 @@ cd ~/holosoma && python3 src/holosoma_inference/.../run_policy.py ... 
ollama serve > /tmp/ollama.log 2>&1 & sleep 3 -# 3) Start Marcus +# 3) Install the Gemini SDK in its own Python 3.10+ env (one-time) +# google-genai requires Python ≥3.9; marcus is pinned to 3.8 by the +# Jetson torch wheel, so Gemini runs in a sibling conda env. +conda create -n gemini_sdk python=3.10 -y +conda activate gemini_sdk +pip install google-genai numpy +conda deactivate + +# 4) Provide the Gemini key (voice is the only cloud dep) +export MARCUS_GEMINI_API_KEY='' # SANAD_GEMINI_API_KEY also accepted +# Optional: only needed if gemini_sdk env is NOT at ~/miniconda3/envs/gemini_sdk/ +# export MARCUS_GEMINI_PYTHON=/path/to/gemini_sdk/bin/python + +# 5) Start Marcus conda activate marcus cd ~/Marcus python3 run_marcus.py @@ -255,7 +273,7 @@ Marcus/ ├── Brain/ orchestrator, parser, executor, memory ├── Vision/ YOLO + image-guided search ├── Navigation/ goal nav, patrol, odometry -├── Voice/ built-in mic, TTS, energy wake detector, faster-whisper STT +├── Voice/ audio I/O (mic + speaker), Gemini Live STT, TtsMaker ├── Autonomous/ exploration state machine ├── Lidar/ SLAM engine (subprocess) ├── Server/ WebSocket interface @@ -283,16 +301,19 @@ Marcus/ ## Design principles -1. **Offline-first.** No cloud dependency in the default path. Internet can be - wired in for specific backends (e.g. future edge-tts) but it's opt-in. +1. **Offline-first where it matters.** Vision, reasoning, motion, navigation, + memory, LiDAR — all on the Jetson. The single cloud dependency is Gemini + Live STT (speech in only, text out — Marcus's brain still owns the reply). + It can be swapped for any other STT by reimplementing `Voice/gemini_script.py` + behind the same `start()/stop()` + `on_command(text, lang)` callback. 2. **GPU mandatory.** YOLO refuses to start on CPU — Marcus is a safety-critical robot, silently downgrading to 2 FPS vision is worse than failing loudly. 3. **Swappable subsystems.** Each API file can be reimplemented behind the same public functions. 
Replace YOLO with DETR, Qwen with LLaVA, TtsMaker with - Piper — Brain never notices. -4. **Config over code.** Tunables live in `Config/*.json` / `.yaml`; 156 config - keys are all actively referenced (0 orphans). Change persona, wake word, - enabled subsystems, or thresholds without touching a `.py` file. + Piper, Gemini STT with Whisper — Brain never notices. +4. **Config over code.** Tunables live in `Config/*.json` / `.yaml`; every key + is actively referenced (0 orphans). Change persona, wake word, enabled + subsystems, or thresholds without touching a `.py` file. 5. **English only.** Arabic support was removed because the G1 firmware's TTS silently maps Arabic to Chinese. If bilingual TTS is ever needed again, see `git log` for the removed Piper / edge-tts paths. diff --git a/Voice/audio_io.py b/Voice/audio_io.py new file mode 100644 index 0000000..170ef75 --- /dev/null +++ b/Voice/audio_io.py @@ -0,0 +1,345 @@ +"""Hardware-agnostic audio I/O for Marcus voice pipelines. + +Direct port of /home/zedx/Robotics_workspace/yslootahtech/Project/Sanad/voice/audio_io.py, +with USB mic/speaker profiles (Anker/Hollyland) removed — Marcus only uses the +G1 on-board profile. Class names and method signatures match Sanad verbatim so +the rest of the Gemini brain code lifts over cleanly. + +Mics deliver int16 mono PCM at 16 kHz. +Speakers accept int16 mono PCM plus a `source_rate` and resample internally. 
+ +Usage: + + audio = AudioIO.from_profile("builtin", audio_client=ac) + audio.start() + try: + chunk = audio.mic.read_chunk(1024) + audio.speaker.begin_stream() + audio.speaker.send_chunk(pcm_24k, 24000) + audio.speaker.wait_finish() + finally: + audio.stop() +""" + +from __future__ import annotations + +import json +import logging +import os +import socket +import struct +import subprocess +import sys +import threading +import time +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, Optional, Union + +import numpy as np + +_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if _PROJECT_DIR not in sys.path: + sys.path.insert(0, _PROJECT_DIR) +try: + from Core.config_loader import load_config + _VCFG = load_config("Voice") or {} +except Exception: + _VCFG = {} + +log = logging.getLogger("audio_io") + +_MIC_CFG = _VCFG.get("mic_udp", {}) or {} +_SP_CFG = _VCFG.get("speaker", {}) or {} + +TARGET_MIC_RATE = 16_000 + +_MCAST_GRP = _MIC_CFG.get("group", "239.168.123.161") +_MCAST_PORT = int(_MIC_CFG.get("port", 5555)) +_MIC_BUF_MAX = int(_MIC_CFG.get("buffer_max_bytes", 64_000)) +_MIC_READ_TIMEOUT = float(_MIC_CFG.get("read_timeout_sec", 0.04)) + +PCMLike = Union[bytes, bytearray, memoryview, np.ndarray] + + +def _find_g1_local_ip() -> str: + """Find the host IPv4 address on the G1's internal 192.168.123.0/24 network.""" + out = subprocess.run( + ["ip", "-4", "-o", "addr"], capture_output=True, text=True, + ).stdout + for line in out.splitlines(): + for tok in line.split(): + if tok.startswith("192.168.123."): + return tok.split("/")[0] + raise RuntimeError("no 192.168.123.x interface found") + + +def _resample_int16(pcm: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray: + if src_rate == dst_rate or pcm.size == 0: + return pcm.astype(np.int16, copy=False) + target_len = max(1, int(len(pcm) * dst_rate / src_rate)) + return np.interp( + np.linspace(0, len(pcm), target_len, endpoint=False), 
+ np.arange(len(pcm)), + pcm.astype(np.float64), + ).astype(np.int16) + + +def _as_int16_array(pcm: PCMLike) -> np.ndarray: + if isinstance(pcm, np.ndarray): + return pcm.astype(np.int16, copy=False) + return np.frombuffer(bytes(pcm), dtype=np.int16) + + +# ─── Protocols ──────────────────────────────────────────── + +class Mic(ABC): + sample_rate: int = TARGET_MIC_RATE + + @abstractmethod + def start(self) -> None: ... + @abstractmethod + def read_chunk(self, num_bytes: int) -> bytes: ... + @abstractmethod + def flush(self) -> None: ... + @abstractmethod + def stop(self) -> None: ... + + +class Speaker(ABC): + @abstractmethod + def begin_stream(self) -> None: ... + + @abstractmethod + def send_chunk(self, pcm: PCMLike, source_rate: int) -> None: + """Queue PCM for playback. `source_rate` is the sample rate of `pcm`.""" + + @abstractmethod + def wait_finish(self) -> None: ... + @abstractmethod + def stop(self) -> None: ... + + @property + @abstractmethod + def interrupted(self) -> bool: ... 
+ + @property + def total_sent_sec(self) -> float: + return 0.0 + + +# ─── G1 built-in (UDP mic + AudioClient speaker) ────────── + +class BuiltinMic(Mic): + """G1 robot's on-board mic published over UDP multicast.""" + + sample_rate = TARGET_MIC_RATE + + def __init__(self, group: str = _MCAST_GRP, port: int = _MCAST_PORT, + buf_max: int = _MIC_BUF_MAX): + self._group = group + self._port = port + self._buf_max = buf_max + self._sock = None # type: Optional[socket.socket] + self._buf = bytearray() + self._lock = threading.Lock() + self._running = False + self._thread = None # type: Optional[threading.Thread] + + def start(self) -> None: + if self._running: + return + local_ip = _find_g1_local_ip() + self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + self._sock.bind(("", self._port)) + mreq = struct.pack( + "4s4s", + socket.inet_aton(self._group), + socket.inet_aton(local_ip), + ) + self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq) + self._sock.settimeout(1.0) + self._running = True + self._thread = threading.Thread(target=self._recv_loop, daemon=True) + self._thread.start() + log.info("BuiltinMic joined %s:%d on %s", self._group, self._port, local_ip) + + def _recv_loop(self) -> None: + while self._running: + try: + data, _ = self._sock.recvfrom(4096) + with self._lock: + self._buf.extend(data) + if len(self._buf) > self._buf_max: + del self._buf[:len(self._buf) - self._buf_max] + except socket.timeout: + continue + except Exception: + if self._running: + time.sleep(0.01) + + def read_chunk(self, num_bytes: int) -> bytes: + deadline = time.time() + _MIC_READ_TIMEOUT + while time.time() < deadline: + with self._lock: + if len(self._buf) >= num_bytes: + chunk = bytes(self._buf[:num_bytes]) + del self._buf[:num_bytes] + return chunk + time.sleep(0.003) + with self._lock: + avail = len(self._buf) + if avail > 0: + chunk = bytes(self._buf[:avail]) + del 
self._buf[:avail] + return chunk + b"\x00" * (num_bytes - avail) + return b"\x00" * num_bytes + + def flush(self) -> None: + with self._lock: + self._buf.clear() + + def stop(self) -> None: + self._running = False + if self._sock is not None: + try: + self._sock.close() + except Exception: + pass + self._sock = None + + +class BuiltinSpeaker(Speaker): + """G1 robot's built-in speaker via AudioClient.PlayStream (16 kHz mono).""" + + HARDWARE_RATE = 16_000 + + def __init__(self, audio_client: Any, app_name: Optional[str] = None): + self._ac = audio_client + try: + self._ac.SetVolume(100) + except Exception: + log.warning("BuiltinSpeaker.SetVolume failed") + self._app_name = app_name or _SP_CFG.get("app_name", "marcus") + self._begin_pause = float(_SP_CFG.get("begin_stream_pause_sec", 0.15)) + self._finish_margin = float(_SP_CFG.get("wait_finish_margin_sec", 0.3)) + self._stop_flag = threading.Event() + self._stream_id = None # type: Optional[str] + self._total_sent = 0.0 + self._play_start = 0.0 + + def _stop_play_api(self) -> None: + try: + from unitree_sdk2py.g1.audio.g1_audio_api import ( + ROBOT_API_ID_AUDIO_STOP_PLAY, + ) + self._ac._Call( + ROBOT_API_ID_AUDIO_STOP_PLAY, + json.dumps({"app_name": self._app_name}), + ) + except Exception: + log.warning("BuiltinSpeaker AUDIO_STOP_PLAY failed") + + def begin_stream(self) -> None: + self._stop_flag.clear() + self._stop_play_api() + time.sleep(self._begin_pause) + self._stream_id = "s_{}".format(int(time.time() * 1000)) + self._total_sent = 0.0 + self._play_start = time.time() + + def send_chunk(self, pcm: PCMLike, source_rate: int) -> None: + if self._stop_flag.is_set(): + return + arr = _as_int16_array(pcm) + if arr.size < 10: + return + hw = _resample_int16(arr, int(source_rate), self.HARDWARE_RATE) + self._ac.PlayStream(self._app_name, self._stream_id, hw.tobytes()) + self._total_sent += len(hw) / float(self.HARDWARE_RATE) + + def wait_finish(self) -> None: + elapsed = time.time() - self._play_start + remaining = 
self._total_sent - elapsed + self._finish_margin + waited = 0.0 + while waited < remaining and not self._stop_flag.is_set(): + time.sleep(0.1) + waited += 0.1 + self._stop_play_api() + + def stop(self) -> None: + self._stop_flag.set() + self._stop_play_api() + + @property + def interrupted(self) -> bool: + return self._stop_flag.is_set() + + @property + def total_sent_sec(self) -> float: + return self._total_sent + + +# ─── AudioIO factory ────────────────────────────────────── + +_PROFILE_ALIASES = { + "builtin": "builtin", + "g1": "builtin", + "g1_builtin": "builtin", +} + +SUPPORTED_PROFILES = ("builtin",) + + +@dataclass +class AudioIO: + mic: Mic + speaker: Speaker + profile_id: str = field(default="builtin") + + def start(self) -> None: + self.mic.start() + + def stop(self) -> None: + try: + self.speaker.stop() + except Exception: + log.warning("AudioIO speaker.stop failed", exc_info=True) + try: + self.mic.stop() + except Exception: + log.warning("AudioIO mic.stop failed", exc_info=True) + + @classmethod + def from_profile( + cls, + profile_id: str, + *, + audio_client: Optional[Any] = None, + ) -> "AudioIO": + """Build an AudioIO for the requested profile. + + `audio_client` is the initialised `unitree_sdk2py` `AudioClient` and + is required for the `builtin` profile (the G1 on-board speaker). 
+ """ + raw = (profile_id or "").strip().lower() + resolved = _PROFILE_ALIASES.get(raw) + if resolved is None: + raise ValueError( + "unknown audio profile {!r}; supported: {}".format( + profile_id, ", ".join(SUPPORTED_PROFILES), + ) + ) + + if resolved == "builtin": + if audio_client is None: + raise ValueError( + "profile 'builtin' requires audio_client (G1 AudioClient)" + ) + return cls( + mic=BuiltinMic(), + speaker=BuiltinSpeaker(audio_client), + profile_id=resolved, + ) + raise AssertionError("unhandled resolved profile: {!r}".format(resolved)) diff --git a/Voice/builtin_mic.py b/Voice/builtin_mic.py index e524169..7b6c868 100644 --- a/Voice/builtin_mic.py +++ b/Voice/builtin_mic.py @@ -1,204 +1,47 @@ """ -builtin_mic.py — G1 built-in microphone (UDP multicast capture) -================================================================ -The G1 humanoid's on-board microphone is published by the Unitree firmware -as an RTP-like UDP multicast stream on 239.168.123.161:5555, carrying -16 kHz mono int16 PCM. Any host on the robot's 192.168.123.0/24 network -can join the group and read the audio — no extra SDK call required. +builtin_mic.py — backward-compat shim. -This module intentionally has no dependency on pyaudio, pulseaudio, or the -unitree_sdk2py package. Joining the multicast group is all that's needed. +The G1 on-board microphone implementation now lives in +[Voice/audio_io.py](Voice/audio_io.py) where it can be paired with the +matching BuiltinSpeaker via `AudioIO.from_profile("builtin", ...)` — +the same structure Sanad uses. -Usage: - from Voice.builtin_mic import BuiltinMic - mic = BuiltinMic() - mic.start() - try: - chunk = mic.read_chunk(1024) # 512 samples, 32 ms at 16 kHz - ... - finally: - mic.stop() +This module exists so existing imports (`from Voice.builtin_mic import +BuiltinMic`) keep working for the non-Gemini voice paths and for +`API/audio_api.py`. 
It subclasses the canonical `BuiltinMic` and adds +`read_seconds()`, which is used by `API/audio_api.record()`. -Ported from Project/Sanad/voice/audio_io.py (Sanad's production implementation). +Do not add new code here — extend Voice/audio_io.py instead. """ from __future__ import annotations -import os -import socket -import struct -import subprocess -import sys -import threading import time -from typing import Optional -# Load defaults from Config/config_Voice.json::mic_udp so they can be tuned -# without editing code. Falls back to the hardcoded literals below if the -# config isn't reachable (e.g., when imported from a test harness). -_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -if _PROJECT_DIR not in sys.path: - sys.path.insert(0, _PROJECT_DIR) -try: - from Core.config_loader import load_config - _mic_udp = (load_config("Voice") or {}).get("mic_udp", {}) or {} -except Exception: - _mic_udp = {} - -DEFAULT_GROUP = str(_mic_udp.get("group", "239.168.123.161")) -DEFAULT_PORT = int(_mic_udp.get("port", 5555)) -DEFAULT_BUF_MAX = int(_mic_udp.get("buffer_max_bytes", 64_000)) # ~2 s of 16 kHz mono int16 -DEFAULT_READ_TIMEOUT = float(_mic_udp.get("read_timeout_sec", 0.04)) # budget per read_chunk call -SAMPLE_RATE = 16_000 # hardware rate — do not change +from Voice.audio_io import BuiltinMic as _BaseBuiltinMic -def _find_g1_local_ip() -> str: - """ - Return the host IPv4 on the G1's internal 192.168.123.0/24 network. - Required by IP_ADD_MEMBERSHIP so the kernel knows which NIC to join on. - """ - out = subprocess.run( - ["ip", "-4", "-o", "addr"], capture_output=True, text=True, - ).stdout - for line in out.splitlines(): - for tok in line.split(): - if tok.startswith("192.168.123."): - return tok.split("/")[0] - raise RuntimeError( - "BuiltinMic: no interface on 192.168.123.0/24 — " - "host is not on the G1's internal network" - ) - - -class BuiltinMic: - """ - G1 on-board microphone over UDP multicast. 
- - Thread-safe: a background daemon thread receives datagrams into an - internal ring buffer; `read_chunk(n)` pulls the next `n` bytes or - blocks up to `read_timeout` before returning zeros. - """ - - sample_rate = SAMPLE_RATE - - def __init__( - self, - group: str = DEFAULT_GROUP, - port: int = DEFAULT_PORT, - buf_max: int = DEFAULT_BUF_MAX, - read_timeout: float = DEFAULT_READ_TIMEOUT, - ): - self._group = group - self._port = port - self._buf_max = buf_max - self._read_timeout = read_timeout - self._sock: Optional[socket.socket] = None - self._buf = bytearray() - self._lock = threading.Lock() - self._running = False - self._thread: Optional[threading.Thread] = None - - def start(self) -> None: - if self._running: - return - local_ip = _find_g1_local_ip() - self._sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - self._sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - self._sock.bind(("", self._port)) - mreq = struct.pack( - "4s4s", - socket.inet_aton(self._group), - socket.inet_aton(local_ip), - ) - self._sock.setsockopt(socket.IPPROTO_IP, socket.IP_ADD_MEMBERSHIP, mreq) - self._sock.settimeout(1.0) - self._running = True - self._thread = threading.Thread( - target=self._recv_loop, daemon=True, name="builtin_mic_rx", - ) - self._thread.start() - print(f" [BuiltinMic] joined {self._group}:{self._port} on {local_ip}") - - def _recv_loop(self) -> None: - while self._running: - try: - data, _ = self._sock.recvfrom(4096) - with self._lock: - self._buf.extend(data) - # ring-buffer: drop oldest when we'd exceed buf_max - if len(self._buf) > self._buf_max: - del self._buf[: len(self._buf) - self._buf_max] - except socket.timeout: - continue - except Exception: - if self._running: - time.sleep(0.01) - - def read_chunk(self, num_bytes: int) -> bytes: - """ - Return exactly `num_bytes` of 16 kHz mono int16 PCM. - - Waits up to `read_timeout` for that many bytes to be available. 
- If the buffer is still short after the timeout, returns whatever - is available padded with silence. Never blocks forever. - """ - deadline = time.time() + self._read_timeout - while time.time() < deadline: - with self._lock: - if len(self._buf) >= num_bytes: - chunk = bytes(self._buf[:num_bytes]) - del self._buf[:num_bytes] - return chunk - time.sleep(0.003) - with self._lock: - avail = len(self._buf) - if avail > 0: - chunk = bytes(self._buf[:avail]) - del self._buf[:avail] - return chunk + b"\x00" * (num_bytes - avail) - return b"\x00" * num_bytes +class BuiltinMic(_BaseBuiltinMic): + """G1 on-board mic + `read_seconds()` convenience.""" def read_seconds(self, seconds: float) -> bytes: - """ - Convenience: capture `seconds` of audio and return as bytes. - Blocks for the full duration (not a real-time producer). - """ - num_bytes = int(seconds * self.sample_rate * 2) # 2 bytes/sample (int16) + """Capture `seconds` of audio and return as bytes.""" + num_bytes = int(seconds * self.sample_rate * 2) out = bytearray() chunk_bytes = 1024 while len(out) < num_bytes: out.extend(self.read_chunk(min(chunk_bytes, num_bytes - len(out)))) return bytes(out) - def flush(self) -> None: - """Drop all buffered audio (e.g. 
after the robot spoke).""" - with self._lock: - self._buf.clear() - def stop(self) -> None: - self._running = False - if self._sock is not None: - try: - self._sock.close() - except Exception: - pass - self._sock = None - if self._thread is not None: - self._thread.join(timeout=1.5) - self._thread = None - - -# ──────────────────────────────────────────────────────────────── # Standalone test — capture 3 s and print energy stats -# ──────────────────────────────────────────────────────────────── - if __name__ == "__main__": import array print("BuiltinMic standalone test — capturing 3 s from G1...") mic = BuiltinMic() mic.start() - time.sleep(0.3) # let the receiver thread warm up + time.sleep(0.3) raw = mic.read_seconds(3.0) mic.stop() @@ -212,4 +55,4 @@ if __name__ == "__main__": if mean_abs > 30: print(" OK — mic is capturing audio") else: - print(" WARN — signal very low, check G1 audio service is running") + print(" WARN — signal very low") diff --git a/Voice/gemini_runner.py b/Voice/gemini_runner.py new file mode 100644 index 0000000..cea756e --- /dev/null +++ b/Voice/gemini_runner.py @@ -0,0 +1,440 @@ +#!/usr/bin/env python3 +"""Voice/gemini_runner.py — Gemini Live STT subprocess. + +Runs in a Python 3.10+ env (the `gemini_sdk` conda env on this Jetson) so it +can import `google-genai`, which doesn't support Python 3.8. The marcus env +itself is pinned to Python 3.8 by the NVIDIA Jetson torch wheel, so Gemini +has to live in its own process — the same pattern Sanad uses. + +The marcus parent process spawns this script via: + + /path/to/gemini_sdk/python -u Voice/gemini_runner.py + +and parses the JSON-lines stream we emit on stdout. The parent never sees +audio bytes — this script owns the mic, the Gemini WebSocket, AND the WAV +recording, so the IPC boundary stays narrow (just transcripts). 
+ +──────────────────────────────────────────────────────────────────────── +Stdout protocol (one JSON object per line, UTF-8): + {"type":"ready"} session connected, mic is live + {"type":"user", "text":"..."} user input transcription + {"type":"bot", "text":"..."} Gemini's text reply (logged only — never spoken) + {"type":"turn_end"} Gemini emitted turn_complete + {"type":"reconnect", "reason":"..."} session ended, will reconnect + {"type":"log", "level":"info|warn|error", "msg":"..."} + +Stdin protocol (line-based): + "stop\n" request graceful shutdown + +Exit codes: + 0 — clean shutdown after "stop" or signal + 2 — google-genai not importable + 3 — no API key + 4 — fatal session loop crash +──────────────────────────────────────────────────────────────────────── + +Env vars: + MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY) — required + MARCUS_GEMINI_MODEL (optional) — model id + MARCUS_GEMINI_VOICE (optional, ignored in TEXT mode) + MARCUS_PROJECT_ROOT (optional) — for sys.path + +This file uses Python 3.10+ syntax — type unions with `|`, etc. — because +the gemini_sdk env is 3.10+. DO NOT try to import it from marcus 3.8. +""" + +from __future__ import annotations + +import asyncio +import json +import os +import signal +import sys +import threading +import time +from typing import Any + +import numpy as np + +# Make the Marcus project importable so we can reuse Voice/audio_io.py and +# Voice/turn_recorder.py (both pure-stdlib + numpy, no Python-version traps). 
+_PROJECT_ROOT = ( + os.environ.get("MARCUS_PROJECT_ROOT") + or os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +) +if _PROJECT_ROOT not in sys.path: + sys.path.insert(0, _PROJECT_ROOT) + +from Voice.audio_io import BuiltinMic +from Voice.turn_recorder import TurnRecorder + +try: + from Core.config_loader import load_config + _VCFG = load_config("Voice") or {} +except Exception: + _VCFG = {} + +_STT = _VCFG.get("stt", {}) + + +# ─── stdout / stderr helpers ────────────────────────────────────── + +_stdout_lock = threading.Lock() + + +def emit(payload: dict) -> None: + """Write one JSON line to stdout. Thread-safe + flushed.""" + line = json.dumps(payload, ensure_ascii=False, separators=(",", ":")) + with _stdout_lock: + sys.stdout.write(line + "\n") + sys.stdout.flush() + + +def log(level: str, msg: str) -> None: + """Send a log line to the parent (parent forwards to logs/voice.log).""" + emit({"type": "log", "level": level, "msg": msg}) + + +# ─── stdin watcher (graceful shutdown) ──────────────────────────── + + +_STOP_REQUESTED = threading.Event() +_MIC_HOLDER: list = [] # length-≤1 list — holds the active BuiltinMic + + +def _stdin_watcher() -> None: + try: + for line in sys.stdin: + cmd = line.strip().lower() + if cmd == "stop": + log("info", "stop received from parent — exiting") + _STOP_REQUESTED.set() + return + elif cmd == "flush": + # Parent asks us to drop buffered mic audio (e.g. before + # TtsMaker plays a reply, so the robot's own voice doesn't + # come back as a fake user utterance). 
+ if _MIC_HOLDER: + try: + _MIC_HOLDER[0].flush() + except Exception: + pass + except Exception: + return + + +threading.Thread(target=_stdin_watcher, daemon=True, name="stdin-watcher").start() + + +def _install_signal_handlers() -> None: + def _handle(_signum, _frame): + log("info", "signal received — exiting") + _STOP_REQUESTED.set() + for sig in (signal.SIGTERM, signal.SIGINT): + try: + signal.signal(sig, _handle) + except Exception: + pass + + +# ─── tunables (mirrors Voice/gemini_script.py reads) ────────────── + +_MODEL = os.environ.get( + "MARCUS_GEMINI_MODEL", + _STT.get("gemini_model", "gemini-2.5-flash-native-audio-preview-12-2025"), +) +_DEFAULT_VOICE = os.environ.get( + "MARCUS_GEMINI_VOICE", + _STT.get("gemini_voice_name", "Charon"), +) + +_API_KEY = ( + os.environ.get("MARCUS_GEMINI_API_KEY") + or os.environ.get("SANAD_GEMINI_API_KEY") + or _STT.get("gemini_api_key", "") +) + +_MIC_GAIN = float(_STT.get("mic_gain", 1.0)) +_SESSION_TIMEOUT = float(_STT.get("gemini_session_timeout_sec", 660)) +_MAX_RECONNECT_DELAY = float(_STT.get("gemini_max_reconnect_delay_sec", 30)) +_MAX_CONSECUTIVE_ERRORS = int(_STT.get("gemini_max_consecutive_errors", 10)) +_NO_MESSAGES_TIMEOUT = float(_STT.get("gemini_no_messages_timeout_sec", 30)) + +SEND_SAMPLE_RATE = int(_STT.get("gemini_send_sample_rate", 16000)) +CHUNK_SIZE = int(_STT.get("gemini_chunk_size", 512)) +_CHUNK_BYTES = CHUNK_SIZE * 2 + +_REC_ENABLED = bool(_STT.get("gemini_record_enabled", True)) +_RECV_RATE = int(_STT.get("gemini_receive_sample_rate", 24000)) +_DATA_DIR = os.path.join( + _PROJECT_ROOT, + _VCFG.get("audio", {}).get("data_dir", "Data/Voice/Recordings"), + "gemini_turns", +) + +_SYS_PROMPT = _STT.get( + "gemini_system_prompt", + "Transcribe what the user says to Sanad. 
Stay silent.", +) +_SP_FILE = _STT.get("gemini_system_prompt_file", "") +if _SP_FILE: + _sp_path = ( + _SP_FILE if os.path.isabs(_SP_FILE) + else os.path.join(_PROJECT_ROOT, _SP_FILE) + ) + try: + with open(_sp_path, "r", encoding="utf-8") as f: + txt = f.read().strip() + if txt: + _SYS_PROMPT = txt + except Exception: + pass + + +# ─── main async loop ────────────────────────────────────────────── + + +def _build_config(types): + vad_start = _STT.get("gemini_vad_start_sensitivity", "START_SENSITIVITY_HIGH") + vad_end = _STT.get("gemini_vad_end_sensitivity", "END_SENSITIVITY_LOW") + prefix_ms = int(_STT.get("gemini_vad_prefix_padding_ms", 20)) + silence_ms = int(_STT.get("gemini_vad_silence_duration_ms", 200)) + + return types.LiveConnectConfig( + response_modalities=["TEXT"], + realtime_input_config=types.RealtimeInputConfig( + automatic_activity_detection=types.AutomaticActivityDetection( + disabled=False, + start_of_speech_sensitivity=getattr(types.StartSensitivity, vad_start), + end_of_speech_sensitivity=getattr(types.EndSensitivity, vad_end), + prefix_padding_ms=prefix_ms, + silence_duration_ms=silence_ms, + ), + ), + input_audio_transcription=types.AudioTranscriptionConfig(), + system_instruction=types.Content( + parts=[types.Part(text=_SYS_PROMPT)], + ), + ) + + +async def _send_mic_loop(session, types_mod, mic, recorder, done: asyncio.Event) -> None: + loop = asyncio.get_event_loop() + frame_pause = CHUNK_SIZE / float(SEND_SAMPLE_RATE) + last_activity = time.time() + + while not done.is_set() and not _STOP_REQUESTED.is_set(): + try: + raw = await loop.run_in_executor(None, mic.read_chunk, _CHUNK_BYTES) + except Exception as e: + log("warn", f"mic read failed: {e}") + break + + if not raw: + await asyncio.sleep(frame_pause) + continue + + if _MIC_GAIN != 1.0: + samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) + samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16) + raw = samples.tobytes() + + # Per-turn user-audio capture for 
the WAV recorder. We don't have + # Gemini's "is the AI speaking" flag (no audio out), so capture + # whenever we have meaningful energy. + try: + samples_view = np.frombuffer(raw, dtype=np.int16) + if samples_view.size and int(np.abs(samples_view).max()) > 250: + recorder.capture_user(raw) + except Exception: + pass + + now = time.time() + if now - last_activity > 10: + log("info", f"alive (idle {now - last_activity:.0f}s)") + last_activity = now + + try: + await session.send_realtime_input( + audio=types_mod.Blob( + data=raw, + mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}", + ), + ) + except asyncio.CancelledError: + return + except Exception as e: + log("warn", f"mic send failed: {e}") + done.set() + return + + await asyncio.sleep(frame_pause) + + +async def _receive_loop(session, recorder, done: asyncio.Event) -> None: + last_recv = time.time() + try: + while not done.is_set() and not _STOP_REQUESTED.is_set(): + async for response in session.receive(): + last_recv = time.time() + if done.is_set(): + break + + if (hasattr(response, "go_away") + and getattr(response, "go_away", None) is not None): + emit({"type": "reconnect", "reason": "server go_away"}) + done.set() + return + + sc = getattr(response, "server_content", None) + if sc is None: + continue + + it = getattr(sc, "input_transcription", None) + if it is not None: + text = (getattr(it, "text", "") or "").strip() + if text: + emit({"type": "user", "text": text}) + try: + recorder.add_user_text(text) + except Exception: + pass + + mt = getattr(sc, "model_turn", None) + if mt is not None: + for part in getattr(mt, "parts", []) or []: + txt = getattr(part, "text", None) + if txt: + txt = txt.strip() + if txt: + emit({"type": "bot", "text": txt}) + try: + recorder.add_robot_text(txt) + except Exception: + pass + + if getattr(sc, "turn_complete", False): + try: + recorder.finish_turn() + except Exception: + pass + emit({"type": "turn_end"}) + + if time.time() - last_recv > _NO_MESSAGES_TIMEOUT: + log("warn", 
f"no messages from Gemini for {_NO_MESSAGES_TIMEOUT:.0f}s") + break + await asyncio.sleep(0.1) + except asyncio.CancelledError: + return + except Exception as e: + log("warn", f"receive ended: {e}") + finally: + done.set() + + +async def main_async() -> int: + if not _API_KEY: + log("error", "no Gemini API key (set MARCUS_GEMINI_API_KEY)") + return 3 + + try: + from google import genai + from google.genai import types + except Exception as e: + log("error", f"google-genai not importable: {e}") + return 2 + + try: + client = genai.Client(api_key=_API_KEY) + except Exception as e: + log("error", f"failed to create Gemini client: {e}") + return 4 + + config = _build_config(types) + mic = BuiltinMic() + mic.start() + _MIC_HOLDER.append(mic) # expose to the stdin "flush" watcher + + recorder = TurnRecorder( + enabled=_REC_ENABLED, + out_dir=_DATA_DIR, + user_rate=SEND_SAMPLE_RATE, + robot_rate=_RECV_RATE, + ) + + session_num = 0 + consecutive_errors = 0 + start = time.time() + rc = 0 + + try: + while not _STOP_REQUESTED.is_set(): + session_num += 1 + uptime_min = (time.time() - start) / 60 + try: + log("info", f"connecting (session #{session_num}, uptime {uptime_min:.0f}m)") + async with client.aio.live.connect(model=_MODEL, config=config) as session: + emit({"type": "ready"}) + consecutive_errors = 0 + mic.flush() + done = asyncio.Event() + try: + await asyncio.wait_for( + asyncio.gather( + _send_mic_loop(session, types, mic, recorder, done), + _receive_loop(session, recorder, done), + ), + timeout=_SESSION_TIMEOUT, + ) + except asyncio.TimeoutError: + log("info", f"session timed out after {_SESSION_TIMEOUT:.0f}s") + except asyncio.CancelledError: + pass + + log("info", f"session #{session_num} ended — reconnecting in 1s") + try: + mic.flush() + except Exception: + pass + if _STOP_REQUESTED.is_set(): + break + await asyncio.sleep(1) + except asyncio.CancelledError: + break + except Exception as e: + consecutive_errors += 1 + delay = min(_MAX_RECONNECT_DELAY, 2 ** 
consecutive_errors) + log("error", f"session error #{consecutive_errors}: {e} — retry in {delay:.0f}s") + try: + await asyncio.sleep(delay) + except asyncio.CancelledError: + break + if consecutive_errors >= _MAX_CONSECUTIVE_ERRORS: + log("warn", f"{consecutive_errors} consecutive errors — recreating client") + try: + client = genai.Client(api_key=_API_KEY) + consecutive_errors = 0 + except Exception as ce: + log("error", f"client recreation failed: {ce}") + finally: + try: + mic.stop() + except Exception: + pass + + return rc + + +def main() -> int: + _install_signal_handlers() + try: + return asyncio.run(main_async()) + except KeyboardInterrupt: + return 0 + except Exception as e: + log("error", f"fatal: {e}") + return 4 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/Voice/gemini_script.py b/Voice/gemini_script.py new file mode 100644 index 0000000..e30c5e2 --- /dev/null +++ b/Voice/gemini_script.py @@ -0,0 +1,299 @@ +"""Voice/gemini_script.py — subprocess manager for Gemini Live STT. + +Runs in marcus's Python 3.8 env. The actual Gemini STT lives in +[Voice/gemini_runner.py](Voice/gemini_runner.py) which has to run in a +Python 3.10+ env (e.g. the `gemini_sdk` conda env on the Jetson) because +`google-genai` doesn't support Python 3.8. + +This file spawns the runner as a subprocess, reads JSON-line transcripts +off its stdout, and turns them into the same `on_transcript` / `on_command` +callbacks the rest of marcus expects. The external API of class +`GeminiBrain` is unchanged from the previous in-process port — drop-in +swap for `Voice/marcus_voice.py::_voice_loop_gemini`. + +Sanad uses the same subprocess pattern (its own `live_voice_loop.py` +parses log lines from a Gemini subprocess), so this matches Sanad's +architecture not just in mechanism but in shape. + +──────────────────────────────────────────────────────────────────────── +Subprocess lookup order for the Python 3.10+ binary: + 1. env MARCUS_GEMINI_PYTHON (highest priority) + 2. 
config stt.gemini_python_path + 3. auto-detect — try a list of common conda env paths + 4. raise — explicit error in voice.log +──────────────────────────────────────────────────────────────────────── +""" + +from __future__ import annotations + +import json +import logging +import os +import subprocess +import sys +import threading +from typing import Callable, Optional + +log = logging.getLogger("gemini_brain") + + +# Candidate conda-env paths for the Python 3.10+ binary. Override with +# MARCUS_GEMINI_PYTHON or stt.gemini_python_path if the env lives elsewhere. +_DEFAULT_CANDIDATES = [ + "~/miniconda3/envs/gemini_sdk/bin/python", + "~/anaconda3/envs/gemini_sdk/bin/python", + "~/.miniconda3/envs/gemini_sdk/bin/python", + "/opt/conda/envs/gemini_sdk/bin/python", + "~/miniconda3/envs/sanad/bin/python", + "~/anaconda3/envs/sanad/bin/python", +] + + +def _resolve_runner_python(stt_cfg: dict) -> str: + """Find the Python 3.10+ binary that can import google-genai.""" + explicit = ( + os.environ.get("MARCUS_GEMINI_PYTHON") + or stt_cfg.get("gemini_python_path", "") + ) + if explicit: + path = os.path.expanduser(explicit) + if os.path.isfile(path) and os.access(path, os.X_OK): + return path + raise FileNotFoundError( + "MARCUS_GEMINI_PYTHON / stt.gemini_python_path = " + "{!r} but that binary does not exist or is not executable".format(path) + ) + for cand in _DEFAULT_CANDIDATES: + path = os.path.expanduser(cand) + if os.path.isfile(path) and os.access(path, os.X_OK): + log.info("auto-detected gemini-runner python at %s", path) + return path + raise FileNotFoundError( + "no Python 3.10+ env found for the Gemini runner. Set env " + "MARCUS_GEMINI_PYTHON to the path of a conda env's python with " + "`google-genai` installed (e.g. ~/miniconda3/envs/gemini_sdk/bin/python)." + ) + + +class GeminiBrain: + """Subprocess-managing wrapper around Voice/gemini_runner.py. 
+ + External API kept identical to the in-process version so callers don't + care that Gemini lives in another Python: + + brain = GeminiBrain(audio_io, recorder, voice_name, system_prompt, + api_key=..., on_transcript=cb1, on_command=cb2) + brain.start() + ... + brain.stop() + + `audio_io` and `recorder` are accepted for API parity but unused — + the subprocess owns its own mic and writes its own WAVs (one process + owning the whole audio path is simpler than streaming PCM over a pipe). + """ + + def __init__( + self, + audio_io, # ignored (runner owns its own) + recorder, # ignored (runner owns its own) + voice_name=None, # forwarded via env + system_prompt="", # forwarded via env (or config) + *, + api_key: str = "", + on_transcript: Optional[Callable[[str], None]] = None, + on_command: Optional[Callable[[str, str], None]] = None, + ): + self._voice_name = voice_name or "" + self._system_prompt = system_prompt or "" + self._api_key = api_key + self._on_transcript = on_transcript + self._on_command = on_command + + self._proc = None # type: Optional[subprocess.Popen] + self._reader_thread = None # type: Optional[threading.Thread] + self._err_thread = None # type: Optional[threading.Thread] + self._stopping = False + + # config-loaded lazily so import order doesn't matter + try: + from Core.config_loader import load_config + cfg = load_config("Voice") or {} + except Exception: + cfg = {} + self._stt = cfg.get("stt", {}) + + # ─── lifecycle ──────────────────────────────────────── + + def start(self) -> None: + if self._proc is not None and self._proc.poll() is None: + log.warning("GeminiBrain subprocess already running") + return + self._stopping = False + + try: + python_bin = _resolve_runner_python(self._stt) + except FileNotFoundError as e: + log.error("%s", e) + return + + runner = os.path.abspath( + os.path.join(os.path.dirname(__file__), "gemini_runner.py") + ) + if not os.path.isfile(runner): + log.error("gemini_runner.py not found at %s", runner) + return 
+ + env = os.environ.copy() + if self._api_key: + env["MARCUS_GEMINI_API_KEY"] = self._api_key + if self._voice_name: + env["MARCUS_GEMINI_VOICE"] = self._voice_name + # Forward the system prompt via env so the runner doesn't have to + # re-read the JSON file (and so a trimmed inline string survives). + if self._system_prompt: + env["MARCUS_GEMINI_SYSTEM_PROMPT"] = self._system_prompt + env["MARCUS_PROJECT_ROOT"] = os.path.dirname(os.path.dirname(runner)) + + log.info("spawning gemini runner: %s -u %s", python_bin, runner) + try: + self._proc = subprocess.Popen( + [python_bin, "-u", runner], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=os.path.dirname(os.path.dirname(runner)), + env=env, + bufsize=1, + universal_newlines=True, + ) + except Exception as e: + log.error("failed to spawn gemini runner: %s", e) + self._proc = None + return + + self._reader_thread = threading.Thread( + target=self._stdout_reader, daemon=True, name="gemini-stdout", + ) + self._reader_thread.start() + self._err_thread = threading.Thread( + target=self._stderr_reader, daemon=True, name="gemini-stderr", + ) + self._err_thread.start() + + def flush_mic(self) -> None: + """ + Tell the runner subprocess to drop its buffered mic audio. + + Used before AND after the brain speaks via TtsMaker so the robot's + own voice (which the mic picks up during playback) doesn't come + back from Gemini as a fake user utterance and accidentally hit + the wake-word gate. + """ + proc = self._proc + if proc is None or proc.stdin is None: + return + try: + if not proc.stdin.closed: + proc.stdin.write("flush\n") + proc.stdin.flush() + except Exception: + pass + + def stop(self) -> None: + self._stopping = True + proc = self._proc + if proc is None: + return + # Polite stop: send "stop\n" on stdin, then wait briefly, then SIGTERM. 
+ try: + if proc.stdin and not proc.stdin.closed: + try: + proc.stdin.write("stop\n") + proc.stdin.flush() + except Exception: + pass + except Exception: + pass + try: + proc.wait(timeout=3) + except Exception: + try: + proc.terminate() + except Exception: + pass + try: + proc.wait(timeout=2) + except Exception: + try: + proc.kill() + except Exception: + pass + self._proc = None + + # ─── stdout / stderr drainers ───────────────────────── + + def _stdout_reader(self) -> None: + proc = self._proc + if proc is None or proc.stdout is None: + return + for line in proc.stdout: + if self._stopping: + break + line = line.strip() + if not line: + continue + try: + msg = json.loads(line) + except Exception: + # Non-JSON line — log it raw so we can debug runner crashes. + log.warning("gemini-runner stdout (non-JSON): %s", line[:200]) + continue + self._handle_msg(msg) + + def _stderr_reader(self) -> None: + proc = self._proc + if proc is None or proc.stderr is None: + return + for line in proc.stderr: + line = line.rstrip() + if line: + log.warning("gemini-runner stderr: %s", line[:200]) + + def _handle_msg(self, msg: dict) -> None: + t = msg.get("type") + if t == "user": + text = (msg.get("text") or "").strip() + if not text: + return + log.info("USER: %s", text) + if self._on_transcript is not None: + try: + self._on_transcript(text) + except Exception as e: + log.error("on_transcript failed: %s", e) + if self._on_command is not None: + try: + self._on_command(text, "en") + except Exception as e: + log.error("on_command failed: %s", e) + elif t == "bot": + txt = (msg.get("text") or "").strip() + if txt: + log.info("GEMINI: %s", txt[:120]) + elif t == "turn_end": + log.info("listening") + elif t == "ready": + log.info("connected — listening for speech") + elif t == "reconnect": + log.info("server signalled reconnect: %s", msg.get("reason", "")) + elif t == "log": + level = msg.get("level", "info") + text = msg.get("msg", "") + if level == "error": + log.error("[runner] %s", 
text) + elif level == "warn": + log.warning("[runner] %s", text) + else: + log.info("[runner] %s", text) + else: + log.debug("gemini-runner unknown type=%r: %s", t, msg) diff --git a/Voice/marcus_voice.py b/Voice/marcus_voice.py index 59651ea..2a07953 100644 --- a/Voice/marcus_voice.py +++ b/Voice/marcus_voice.py @@ -1,38 +1,45 @@ #!/usr/bin/env python3 """ -Voice/marcus_voice.py — voice input for Marcus (custom wake + faster-whisper STT). +Voice/marcus_voice.py — Gemini Live voice orchestrator for Marcus. Pipeline: - G1 mic ─► custom wake detector (numpy, offline, instant) - │ - ▼ - TTS "Yes" (AudioAPI → G1 TtsMaker) - │ - ▼ - record command audio until silence - │ - ▼ - faster-whisper base.en int8 (CPU) ──► brain callback(text) -Wake detection is local and instant (Voice/wake_detector.py — pure DSP, no -ML). STT runs only on the recorded command, not on every 2 s of mic input, -so the CPU cost is bounded by how often the user talks. + G1 mic UDP ──► BuiltinMic (Voice/audio_io.py) + │ + ▼ + GeminiBrain (Voice/gemini/script.py) + │ audio out (24 kHz) + ▼ + BuiltinSpeaker (Voice/audio_io.py) ──► G1 speaker + │ user transcript (on_command) + ▼ + _dispatch_gemini_command + - require wake word "Sanad" + - fuzzy-match command_vocab + - dedup within command_cooldown_sec + │ + ▼ + on_command(text, "en") ──► Marcus brain -Why faster-whisper (CTranslate2) instead of openai-whisper: - The Jetson's torch-aarch64 build has a Categorical sampler bug that - produces NaN logits on low-SNR input, which is exactly what the G1 - far-field mic captures. faster-whisper bypasses torch entirely and - runs the int8-quantized model through CTranslate2 — same quality as - Whisper base, no numerical instability, 3× faster on this hardware. +Gemini owns both STT and TTS — it hears the user and replies with its own +voice. 
Marcus's on_command hook fires alongside Gemini's verbal reply so +motion commands (\"Sanad, turn right\") still move the robot body while +the conversation flows naturally. + +Wake word is enforced at dispatch only — Gemini chats normally on all +speech; the robot moves only when \"Sanad\" + a recognised action phrase +appears in the transcript. """ from __future__ import annotations import logging import os +import re import sys import threading import time +from difflib import SequenceMatcher from logging.handlers import RotatingFileHandler from typing import Callable, Optional @@ -61,14 +68,12 @@ log = logging.getLogger("marcus_voice") # ── Transcript log ───────────────────────────────────────────── -# Every transcribed utterance (wake or not, command or not) is -# written here in a simple one-line-per-entry format so the operator -# can scan everything the mic heard without wading through the full -# voice.log. Rotates every 5 MB × 3 backups. +# Every user transcript Gemini emits is written here in a simple +# one-line-per-entry format. Rotates every 5 MB × 3 backups. _TRANSCRIPT_PATH = os.path.join(LOG_DIR, "transcript.log") _transcript_log = logging.getLogger("transcript") _transcript_log.setLevel(logging.INFO) -_transcript_log.propagate = False # don't double-emit +_transcript_log.propagate = False if not _transcript_log.handlers: _th = RotatingFileHandler( _TRANSCRIPT_PATH, maxBytes=5_000_000, backupCount=3, encoding="utf-8", @@ -78,21 +83,11 @@ if not _transcript_log.handlers: def _log_transcript(action: str, text: str) -> None: - """Write one line to logs/transcript.log. - action: 'HEARD' / 'WAKE' / 'CMD' / 'UNK' / ... - """ _transcript_log.info("%-5s %s", action, (text or "").strip()) -# Module-level vocabulary containers. EMPTY on import — populated by -# VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words, -# command_vocab, garbage_patterns}. Config is the single source of truth; -# there are no hardcoded string lists here anymore. 
-# -# If you import this module without running a VoiceModule() first, these -# stay empty → fuzzy-match is a no-op, wake detection rejects everything, -# garbage filter rejects nothing. That's by design: bad config = obvious -# broken behavior, not silently-drifting hardcoded defaults. +# Module-level vocabulary — populated from Config/config_Voice.json::stt. +# Used by the wake-word gate and the fuzzy-match command normalizer. WAKE_WORDS: set = set() COMMAND_VOCAB: list = [] GARBAGE_PATTERNS: set = set() @@ -100,12 +95,7 @@ _MIN_TRANSCRIPTION_LENGTH: int = 3 def _has_wake_word(text: str) -> bool: - """ - True if the utterance contains any wake-word variant as a *whole word* - (word-boundary match, not substring — so "standard" doesn't trigger - off "sand"). - """ - import re + """True if `text` contains any wake-word variant as a whole word.""" low = text.lower() for w in WAKE_WORDS: if re.search(r'\b' + re.escape(w) + r'\b', low): @@ -115,17 +105,10 @@ def _has_wake_word(text: str) -> bool: def _strip_wake_word_once(text: str) -> str: """Single pass of wake-word stripping. Use via _strip_wake_word().""" - import re stripped = text.strip() - - # Case 1: the entire utterance is just a wake word + optional - # trailing punctuation. Return empty string so caller can ack-only. for w in WAKE_WORDS: if re.fullmatch(rf'{re.escape(w)}[\s,.!?]*', stripped, re.IGNORECASE): return "" - - # Case 2: "Sanad " — require whitespace (or comma+ws) between - # wake word and command so "Sanad." doesn't swallow "." as a command. for w in sorted(WAKE_WORDS, key=len, reverse=True): m = re.match( rf'^\s*{re.escape(w)}\s*[,.!?]?\s+(.+)$', @@ -133,35 +116,21 @@ def _strip_wake_word_once(text: str) -> str: ) if m: return m.group(1).strip(' ,.!?') - - # Case 3: " Sanad" — trailing wake word. 
m = re.match( rf'^(.+?)\s+{re.escape(w)}\s*[.!?]*\s*$', text, re.IGNORECASE, ) if m: return m.group(1).strip(' ,.!?') - return text def _strip_wake_word(text: str) -> str: """ - Remove the wake word from the start or end of text, iteratively, - so repeated-wake transcriptions ("Sanad. Sanad.") fully collapse - to the actual command (or empty string if nothing else was said). - - Examples: - "Sanad, turn left" → "turn left" - "Sanad turn left" → "turn left" - "turn left Sanad" → "turn left" - "Sanad." → "" - "Sanad" → "" - "Sanad. Sanad." → "" (was leaving "Sanad" before) - "Sanad Sanad stop" → "stop" (recursive strip) + Remove the wake word from the start or end of text, iteratively, so + repeated-wake transcriptions ("Sanad. Sanad.") fully collapse. + Capped at 5 passes to prevent pathological inputs from looping. """ - # Iterate until stable — each pass peels off one wake word. Cap at - # a handful of iterations so a malicious/garbled input can't loop. for _ in range(5): stripped = _strip_wake_word_once(text) if stripped == text: @@ -172,25 +141,14 @@ def _strip_wake_word(text: str) -> str: def _closest_command(text: str, cutoff: float = 0.72) -> str: """ - Map a Whisper transcription to the closest known command phrase. - + Map a transcription to the closest known command phrase. Returns the canonical command if there's a close-enough match, else - returns the original text unchanged. Close = difflib SequenceMatcher - ratio ≥ cutoff (0.72 empirically rejects unrelated phrases while - accepting common Whisper near-misses like "Turn right up"→"turn right" - or "What do you see?"→"what do you see"). - - Also handles the "transcription contains a command" case — if the - text has a command phrase as a substring (e.g. "Sanad, turn left" - from an echo), extract the command. + returns the original text unchanged. 
""" - from difflib import SequenceMatcher low = text.lower().strip().rstrip(".!?,") if not low: return text - # Cheap substring win first — no fuzzy needed if the command is - # literally in the transcription. for cmd in COMMAND_VOCAB: if cmd in low: return cmd @@ -209,6 +167,8 @@ def _closest_command(text: str, cutoff: float = 0.72) -> str: class VoiceModule: + """Thin orchestrator around GeminiBrain + command dispatch.""" + def __init__( self, audio_api, @@ -223,1088 +183,208 @@ class VoiceModule: self._stt = self._config.get("stt", {}) self._messages = self._config.get("messages", {}) - # Load all voice vocabulary from config — these are the only - # string lists the voice layer uses, and they come from - # config_Voice.json. If a key is missing, the list is empty and - # that feature silently degrades (fuzzy-match no-op, nothing - # rejected as garbage, no wake-word match) — NEVER crashes. + # Load vocab from config — single source of truth. global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, _MIN_TRANSCRIPTION_LENGTH WAKE_WORDS = {w.lower() for w in self._stt.get("wake_words", [])} COMMAND_VOCAB = list(self._stt.get("command_vocab", [])) GARBAGE_PATTERNS = {p.lower() for p in self._stt.get("garbage_patterns", [])} _MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3)) self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72)) - log.info("vocab loaded: %d wake_words, %d command_vocab, %d garbage_patterns", - len(WAKE_WORDS), len(COMMAND_VOCAB), len(GARBAGE_PATTERNS)) - - # ── Custom wake detector ── - from Voice.wake_detector import WakeDetector, WakeConfig - wcfg = WakeConfig( - sample_rate = 16_000, - speech_threshold = float(self._stt.get("speech_threshold", 80.0)), - min_word_duration_s = float(self._stt.get("min_word_duration", 0.20)), - max_word_duration_s = float(self._stt.get("max_word_duration", 1.50)), - post_silence_s = float(self._stt.get("post_silence", 0.30)), - cooldown_s = float(self._stt.get("wake_cooldown", 
1.50)), - chunk_ms = int( self._stt.get("wake_chunk_ms", 50)), - adaptive_window_n = int( self._stt.get("wake_adaptive_window_n", 50)), - adaptive_mult = float(self._stt.get("wake_adaptive_mult", 3.0)), - diag_log_sec = float(self._stt.get("wake_diag_log_sec", 3.0)), + log.info( + "vocab loaded: %d wake_words, %d command_vocab, %d garbage_patterns", + len(WAKE_WORDS), len(COMMAND_VOCAB), len(GARBAGE_PATTERNS), ) - self._detector = WakeDetector(wcfg) - # ── G1 mic ── - from Voice.builtin_mic import BuiltinMic - _mcfg = self._config.get("mic_udp", {}) - self._mic_capture = BuiltinMic( - group = _mcfg.get("group", "239.168.123.161"), - port = _mcfg.get("port", 5555), - buf_max = _mcfg.get("buffer_max_bytes", 64000), - ) - self._sample_rate = self._mic_capture.sample_rate + # Dispatch dedup state: Gemini's input_transcription can fire + # multiple times per turn (streaming partials). Track the last + # canonical command + timestamp so we don't move twice. + self._last_gemini_canon = "" + self._last_gemini_dispatch_at = 0.0 - # ── global software mic gain ── - # Applied to every byte read from the mic, so wake detector, VAD, - # AND Whisper all see the boosted audio. One knob, uniform effect. - # G1 far-field mic benefits from 2.0-3.0 for normal speaking volume; - # above 4.0 you start clipping loud words. - self._mic_gain = float(self._stt.get("mic_gain", 1.0)) - if self._mic_gain != 1.0: - log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain) - - # ── STT backend selection ── - # "faster_whisper" (default): Whisper base.en int8 on CPU via CTranslate2. - # "moonshine": useful-sensors Moonshine via moonshine-voice - # + onnxruntime. Different training, different - # error profile from Whisper — useful when - # Whisper's hallucinations (short "Yes.", - # "Bye.", "It.") are the failure mode. 
- self._backend_name = str(self._stt.get("backend", "faster_whisper")).lower() - - # Lazy-init handles — concrete model loads on first wake so startup - # stays light. `False` marks a failed init so we don't keep retrying. - self._fw = None - self._moonshine = None - - # ── Two-turn wake state (always_on_gated mode) ── - # self._awaiting_command: False = listening for wake. - # True = wake heard, next utterance is - # the command. Cleared after the - # command dispatches or after - # await_command_timeout_sec seconds - # so a stray "Sanad" doesn't arm - # forever. - self._awaiting_command = False - self._await_deadline = 0.0 + # Gemini brain reference for flush_mic() — populated by + # _voice_loop_gemini after spawning the runner subprocess. + self._brain = None self._running = False self._thread = None - self._cooldown_until = 0.0 - log.info("VoiceModule initialized (wake=custom, stt=%s)", self._backend_name) - # ─── gain-applied mic read ──────────────────────────── - - def _read_mic_raw(self, num_bytes: int) -> bytes: - """Raw mic read — no gain. Used by the wake detector whose - thresholds are calibrated against unamplified G1 ambient.""" - return self._mic_capture.read_chunk(num_bytes) - - def _read_mic_gained(self, num_bytes: int) -> bytes: - """ - Mic read with self._mic_gain applied. Used during command - recording so Whisper sees a louder, cleaner signal. NOT used - in the wake loop — amplifying ambient there pushes it over - the wake threshold and the detector can never find its - silent baseline. 
- """ - raw = self._mic_capture.read_chunk(num_bytes) - if not raw or self._mic_gain == 1.0: - return raw - arr = np.frombuffer(raw, dtype=np.int16).astype(np.float32) * self._mic_gain - return np.clip(arr, -32768, 32767).astype(np.int16).tobytes() - - # ─── lazy faster-whisper init ───────────────────────── - - def _get_fw(self): - """Load faster-whisper on first use — startup saved for cold path.""" - if self._fw is not None: - return self._fw - model = self._stt.get("whisper_model", "base.en") - device = self._stt.get("whisper_device", "cpu") - compute = self._stt.get("whisper_compute_type", "int8") - log.info( - "Loading faster-whisper: model=%s device=%s compute=%s", - model, device, compute, - ) - try: - from faster_whisper import WhisperModel - self._fw = WhisperModel(model, device=device, compute_type=compute) - log.info("faster-whisper ready") - except Exception as e: - log.error("faster-whisper init failed: %s — voice will be wake-only", e) - self._fw = None - return self._fw - - # ─── lazy moonshine init ────────────────────────────── - - def _get_moonshine(self): - """ - Load Moonshine (useful-sensors) on first use. Requires - `pip install moonshine-voice` on the target. Returns None if - the package isn't available — caller should fall back. 
- """ - if self._moonshine is not None: - return self._moonshine if self._moonshine is not False else None - lang = self._stt.get("moonshine_language", "en") - log.info("Loading Moonshine: language=%s", lang) - try: - from moonshine_voice import Transcriber - from moonshine_voice.download import download_model - model_path, model_arch = download_model(language=lang) - self._moonshine = Transcriber( - model_path=model_path, model_arch=model_arch, - ) - log.info("Moonshine ready: arch=%s", model_arch) - except Exception as e: - log.error("Moonshine init failed: %s — voice will be wake-only", e) - self._moonshine = False - return None - return self._moonshine - - def _moonshine_decode(self, audio_f32: np.ndarray) -> str: - """Run Moonshine one-shot on a float32 16kHz mono array. Returns ''.""" - m = self._get_moonshine() - if m is None: - return "" - try: - result = m.transcribe_without_streaming( - audio_data=audio_f32, sample_rate=self._sample_rate, - ) - lines = getattr(result, "lines", None) or [] - text = " ".join(getattr(ln, "text", "") for ln in lines).strip() - log.info("moonshine: text=%r", text[:80]) - return text - except Exception as e: - log.error("moonshine transcribe failed: %s", e) - return "" - - # ─── command recording ──────────────────────────────── - - def _record_command(self) -> np.ndarray: - """ - Record the user's command with a hysteretic, adaptive-baseline VAD. - - Design (handles quiet, normal, and loud voices on the G1 mic): - - 1. Sample 200 ms of ambient noise first to learn the floor, - then set the "silence" gate to max(ambient * 2.5, floor). - Eliminates the "my silence threshold is higher than my - user's speaking level" failure mode. - - 2. Two thresholds with hysteresis: - speech_entry — RMS required to count as "speech started" - silence_exit — RMS below which we count silence - (< speech_entry; prevents mid-word bail on - breaths and short consonant gaps). - - 3. Recording can only *end* after we've actually heard speech. 
- Pure silence just runs out to max_record_sec, then returns - empty (the caller plays "I didn't catch that" without - burning a Whisper call on noise). - - 4. After speech is seen, silence_budget accumulates only while - RMS stays below silence_exit. A single loud burst resets - it to zero — so natural "turn... left" pauses don't end the - recording. - """ - # ── config knobs (all overridable via config_Voice.json::stt) ─ - speech_entry_rms = float(self._stt.get("speech_entry_rms", 250.0)) - silence_exit_rms = float(self._stt.get("silence_exit_rms", 120.0)) - silence_dur = float(self._stt.get("silence_duration_sec", 1.2)) - max_dur = float(self._stt.get("max_record_sec", 8.0)) - min_dur = float(self._stt.get("min_record_sec", 0.4)) - ambient_probe_s = float(self._stt.get("ambient_probe_sec", 0.2)) - ambient_mult = float(self._stt.get("ambient_mult", 2.5)) - - small_chunk_bytes = 1024 - analysis_ms = 100 - analysis_bytes = int(self._sample_rate * analysis_ms / 1000) * 2 - - # ── 1. Reuse the wake detector's baseline instead of probing - # the mic right now. The wake detector's _baseline is a rolling - # mean of idle-silence RMS values from the last few seconds. - # - # Why NOT probe at record-time: we arrive here right after TTS - # "Yes", and the user typically starts speaking within 200 ms - # of hearing the ack. A probe window sized to the ambient floor - # then measures the *user's speech* as "ambient" and sets - # speech_entry above the user's actual amplitude — causing the - # "no speech in 8.00s" failure mode observed in the wild. - # - # Cap the baseline at a sensible ceiling so a one-off loud - # transient during idle doesn't lock us out either. 
- probe_buf = bytearray() # no probe audio kept - ambient_rms = getattr(self._detector, "_baseline", 0.0) or 0.0 - ambient_cap = float(self._stt.get("ambient_cap_rms", 200.0)) - ambient_rms = min(ambient_rms, ambient_cap) - - if ambient_rms > 0: - adaptive_exit = max(silence_exit_rms, ambient_rms * ambient_mult) - adaptive_entry = max(speech_entry_rms, ambient_rms * ambient_mult * 1.8) - else: - adaptive_exit, adaptive_entry = silence_exit_rms, speech_entry_rms - - log.info("vad: ambient_rms=%.0f (from wake baseline, cap=%.0f) " - "speech_entry=%.0f silence_exit=%.0f", - ambient_rms, ambient_cap, adaptive_entry, adaptive_exit) - - # ── 2. main capture loop ────────────────────────────────────── - collected = bytearray(probe_buf) # keep probe audio — user may - # have already started talking - analysis_buf = bytearray() - silence_budget = 0.0 - total_time = len(probe_buf) / 2 / self._sample_rate - speech_seen = False - peak_rms_seen = 0.0 - # Byte offset into `collected` at which speech first crossed - # adaptive_entry. We trim pre-speech silence to this point (minus - # ~300 ms pre-roll) before returning. Keeping Whisper's input - # tight (speech + small tails) improves transcription accuracy - # by removing the ambient/HVAC portion that dilutes the mel - # features. 
- speech_start_byte: Optional[int] = None - preroll_bytes = int(self._sample_rate * 0.3) * 2 # 300 ms - wall_start = time.time() - - while total_time < max_dur and (time.time() - wall_start) < max_dur + 2: - raw = self._read_mic_gained(small_chunk_bytes) - if not raw: - time.sleep(0.005) - continue - collected.extend(raw) - analysis_buf.extend(raw) - total_time += (len(raw) // 2) / self._sample_rate - - while len(analysis_buf) >= analysis_bytes: - win = np.frombuffer(bytes(analysis_buf[:analysis_bytes]), dtype=np.int16) - del analysis_buf[:analysis_bytes] - rms = float(np.sqrt(np.mean(win.astype(np.float64) ** 2))) - peak_rms_seen = max(peak_rms_seen, rms) - - if rms >= adaptive_entry: - if not speech_seen: - speech_seen = True - # Record where speech started (byte offset - # in `collected`) so we can trim pre-roll later. - speech_start_byte = max(0, len(collected) - preroll_bytes) - silence_budget = 0.0 - elif speech_seen and rms < adaptive_exit: - silence_budget += analysis_ms / 1000.0 - # between exit and entry → hold state (hysteresis zone) - - # end only *after* we've heard real speech - if (speech_seen - and silence_budget >= silence_dur - and total_time >= min_dur): - log.info("silence after speech at %.2fs (peak_rms=%.0f)", - total_time, peak_rms_seen) - break - - if not speech_seen: - log.info("no speech in %.2fs (peak_rms=%.0f < entry=%.0f) — dropping", - total_time, peak_rms_seen, adaptive_entry) - return np.array([], dtype=np.int16) - - if total_time >= max_dur: - log.info("max-record-sec hit at %.2fs (peak_rms=%.0f)", - total_time, peak_rms_seen) - - # Trim leading pre-speech silence. Keep 300 ms of pre-roll so - # the onset of the first phoneme is preserved for Whisper. 
- if speech_start_byte and speech_start_byte > 0: - trimmed_ms = speech_start_byte / 2 / self._sample_rate * 1000 - log.info("trimmed %.0f ms of leading silence " - "(pre-speech buffer %d bytes)", - trimmed_ms, speech_start_byte) - collected = collected[speech_start_byte:] - - return (np.frombuffer(bytes(collected), dtype=np.int16) - if collected else np.array([], dtype=np.int16)) - - # ─── transcription ──────────────────────────────────── - - def _transcribe(self, audio_i16: np.ndarray) -> str: - """int16 PCM → STT transcription. Returns '' on no-speech/noise.""" - if self._backend_name == "moonshine": - return self._transcribe_moonshine(audio_i16, lenient=False) - - fw = self._get_fw() - if fw is None: - return "" - - # mic_gain was already applied in _read_mic_gained() during - # _record_command, so audio_i16 here is already boosted. - - # int16 → float32 [-1, 1] + DSP pre-processing: - # 1. DC offset removal (subtract mean) — removes any mic bias - # 2. High-pass filter at 80 Hz — kills HVAC rumble, G1 fan noise, - # and speaker-vibration resonance. Whisper ignores the - # rumble band anyway, but it inflates RMS estimation and - # steals dynamic range from the speech band. - # 3. Pre-emphasis (0.97 coeff) — mild high-frequency boost - # that sharpens consonants (/t/, /s/, /k/ plosives/fricatives) - # which Whisper's mel features care most about. - # 4. Peak-normalize to 0.7. - audio_f32 = audio_i16.astype(np.float32) / 32768.0 - # 1. DC removal - audio_f32 = audio_f32 - np.mean(audio_f32) - # 2. High-pass at 80 Hz (1-pole IIR, stable + cheap) - audio_f32 = self._highpass_80hz(audio_f32) - # 3. Pre-emphasis y[n] = x[n] - 0.97 * x[n-1] - audio_f32 = np.append( - audio_f32[:1], audio_f32[1:] - 0.97 * audio_f32[:-1] - ) - # 4. 
Peak-normalize - peak = float(np.abs(audio_f32).max()) - if peak > 1e-4 and peak < 0.7: - boost = 0.7 / peak - audio_f32 = audio_f32 * boost - log.info("peak-normalized ×%.2f (peak %.3f → 0.70)", boost, peak) - - # Initial prompt biases the model toward our command vocabulary. - # Whisper uses this as decoder context — words in the prompt become - # more likely, which converts ambiguous low-SNR audio like "muv rahh" - # from a plausible English phrase ("and provide") into the intended - # command ("move right"). Keep short — long prompts can be echoed. - init_prompt = self._stt.get( - "whisper_initial_prompt", - "turn left, turn right, move forward, walk back, stop, come here, " - "sit down, stand up, raise arm, wave, look around, what do you see, " - "remember this, go home, patrol." - ) - - beam_size = int(self._stt.get("whisper_beam_size", 5)) - no_speech_threshold = float(self._stt.get("whisper_no_speech_threshold", 0.6)) - log_prob_threshold = float(self._stt.get("whisper_log_prob_threshold", -1.0)) - compression_ratio_t = float(self._stt.get("whisper_compression_ratio_threshold", 2.4)) - - # Temperature fallback: greedy first (T=0), then 0.2, then 0.4. - # Whisper retries automatically when a pass is rejected by - # its confidence gates (log_prob < threshold etc.). On noisy - # audio this commonly rescues a bad greedy decode. - temperatures = self._stt.get( - "whisper_temperature_fallback", [0.0, 0.2, 0.4] - ) - try: - segments, info = fw.transcribe( - audio_f32, - language="en", - beam_size=beam_size, # 5 = much better than greedy on noisy audio - temperature=temperatures, # greedy → 0.2 → 0.4 fallback - initial_prompt=init_prompt, # command-vocabulary bias (empty by default) - condition_on_previous_text=False, - vad_filter=False, # we already trimmed silence - without_timestamps=True, - # Whisper's built-in gates — drop transcripts that look - # like hallucinations (very low prob, highly compressed). 
- no_speech_threshold=no_speech_threshold, - log_prob_threshold=log_prob_threshold, - compression_ratio_threshold=compression_ratio_t, - ) - # Collect segments and their mean log-prob for a confidence signal. - seg_list = list(segments) - text = " ".join(s.text for s in seg_list).strip() - nsp = float(getattr(info, "no_speech_prob", 0.0)) - if seg_list: - mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list) - log.info("whisper: lp=%.2f nsp=%.2f text=%r", - mean_lp, nsp, text[:80]) - else: - # CRITICAL: log even when Whisper returned zero segments - # so we can see WHY it dropped everything. Usually nsp is - # above the threshold or the log-prob fallback killed it. - log.info("whisper: (no segments) nsp=%.2f thresholds: nsp>%.2f && lp<%.2f → drop", - nsp, no_speech_threshold, log_prob_threshold) - except Exception as e: - log.error("faster-whisper transcribe failed: %s", e) - return "" - - if not text: - return "" - - # Reject Whisper garbage patterns (stt.garbage_patterns) and - # transcriptions shorter than stt.min_transcription_length. - # Preserve: - # - bare wake words (valid "just Sanad" signal → ack) - # - exact matches in stt.command_vocab (legitimate short - # commands like "go", "hi" must survive the length filter) - low = text.lower().strip().rstrip(".!?,") - vocab_exact = {c.lower() for c in COMMAND_VOCAB} - if low in GARBAGE_PATTERNS or len(low) < _MIN_TRANSCRIPTION_LENGTH: - if low not in WAKE_WORDS and low not in vocab_exact: - log.info("Rejecting likely noise transcription: %r", text) - return "" - - # NOTE: fuzzy-match to canonical command phrase used to happen - # here, but it runs BEFORE gated-mode could see the wake word. - # Moved to _normalize_command() and called at dispatch time - # AFTER the wake-word gate + wake-word strip, so the gate - # always sees the raw Whisper text. - return text - - @staticmethod - def _highpass_80hz(x: np.ndarray, sr: int = 16_000) -> np.ndarray: - """ - 1-pole IIR high-pass at ~80 Hz. 
Attenuates HVAC/fan rumble - without touching the speech band. Cheap: 2 multiplies per sample. - """ - if x.size < 2: - return x - # Alpha from fc=80Hz: alpha = RC / (RC + dt), RC = 1/(2*pi*fc) - import math - rc = 1.0 / (2 * math.pi * 80.0) - dt = 1.0 / sr - alpha = rc / (rc + dt) - y = np.empty_like(x) - y[0] = x[0] - # vectorised enough — the loop is JITted by numpy internally - # for reasonable sizes (~25k samples). - prev_y, prev_x = x[0], x[0] - for i in range(1, x.size): - cur = alpha * (prev_y + x[i] - prev_x) - y[i] = cur - prev_y, prev_x = cur, x[i] - return y - - def _transcribe_raw(self, audio_i16: np.ndarray) -> str: - """ - Like _transcribe but WITHOUT the garbage-pattern / length filters - and without the `initial_prompt` bias. Used for wake verify, where: - - We only care about the first phoneme (s/sh/z) — a 2-char "so" - is a valid /sa-/ signature and MUST NOT be dropped by - min_transcription_length. - - A biased initial_prompt makes Whisper echo itself on unclear - audio ("This is a robot assistant" → not s-starting → reject). - The downside (no Sanad nudge) is fine here because the acoustic - detector has already gated out non-speech. 
- """ - if self._backend_name == "moonshine": - return self._transcribe_moonshine(audio_i16, lenient=True) - - fw = self._get_fw() - if fw is None: - return "" - if self._mic_gain != 1.0: - audio_i16 = np.clip( - audio_i16.astype(np.float32) * self._mic_gain, -32768, 32767 - ).astype(np.int16) - audio_f32 = audio_i16.astype(np.float32) / 32768.0 - peak = float(np.abs(audio_f32).max()) - if peak > 1e-4 and peak < 0.7: - audio_f32 = audio_f32 * (0.7 / peak) - try: - segments, info = fw.transcribe( - audio_f32, - language="en", - beam_size=int(self._stt.get("whisper_beam_size", 5)), - temperature=0.0, - initial_prompt="", # NO bias → NO prompt echo - condition_on_previous_text=False, - vad_filter=False, - without_timestamps=True, - # Looser gates — we're about to do phonetic match, - # not trust the transcription verbatim. - no_speech_threshold=0.85, - log_prob_threshold=-1.8, - compression_ratio_threshold=3.0, - ) - seg_list = list(segments) - text = " ".join(s.text for s in seg_list).strip() - if seg_list: - mean_lp = sum(getattr(s, "avg_logprob", 0.0) for s in seg_list) / len(seg_list) - log.info("whisper-raw: lp=%.2f nsp=%.2f text=%r", - mean_lp, getattr(info, "no_speech_prob", 0.0), text[:80]) - return text - except Exception as e: - log.error("whisper-raw transcribe failed: %s", e) - return "" - - def _transcribe_moonshine(self, audio_i16: np.ndarray, lenient: bool) -> str: - """ - Moonshine decode path. Light DSP only (DC-removal + peak-normalize); - Moonshine has its own internal feature extraction, and the Whisper- - oriented pre-emphasis / 80 Hz HPF are not helpful here. - - lenient=True mirrors _transcribe_raw: skip the garbage-pattern and - min-length filters so wake verify can see short /s-/ phonetic signals. - lenient=False applies the same rejection pipeline as _transcribe. 
- """ - if audio_i16.size == 0: - return "" - - audio_f32 = audio_i16.astype(np.float32) / 32768.0 - audio_f32 = audio_f32 - np.mean(audio_f32) - peak = float(np.abs(audio_f32).max()) - if peak > 1e-4 and peak < 0.7: - audio_f32 = audio_f32 * (0.7 / peak) - - text = self._moonshine_decode(audio_f32) - if not text: - return "" - if lenient: - return text - - low = text.lower().strip().rstrip(".!?,") - vocab_exact = {c.lower() for c in COMMAND_VOCAB} - if low in GARBAGE_PATTERNS or len(low) < _MIN_TRANSCRIPTION_LENGTH: - if low not in WAKE_WORDS and low not in vocab_exact: - log.info("Rejecting likely noise transcription: %r", text) - return "" - return text - - # ─── command transcription ──────────────────────────── - - def _transcribe_command(self, audio_i16: np.ndarray) -> str: - """ - Decode the recorded command audio with faster-whisper. Thin - wrapper over self._transcribe(); exists so _handle_wake and the - always-on loop share one entry point. - """ - if audio_i16.size == 0: - return "" - return self._transcribe(audio_i16) - - def _save_turn_wav( - self, audio_i16: np.ndarray, transcription: str = "", tag: str = "cmd", - ) -> Optional[str]: - """ - Save a single-turn command recording for debugging. - - Filename: {tag}_{epoch}_{sanitised_transcription}.wav - Examples: - cmd_1728562000_turn_right.wav ← successful command - cmd_1728562030_hi.wav ← Whisper misheard as 'Hi' - unk_1728562045_.wav ← Whisper returned empty - cmd_1728562060_thanks_for_watch.wav ← garbage-filtered - - Rotation: keeps the most recent 50 across all tags so the disk - doesn't fill up during a long session. Tunable via - stt.recording_keep_count. - """ - try: - import re as _re - import wave - rec_dir = os.path.join( - PROJECT_ROOT, - self._config.get("audio", {}).get("data_dir", "Data/Voice/Recordings"), - ) - os.makedirs(rec_dir, exist_ok=True) - - # Rotate — keep only the most recent N across all command WAVs. 
- keep = int(self._stt.get("recording_keep_count", 50)) - existing = sorted( - f for f in os.listdir(rec_dir) - if (f.startswith("cmd_") or f.startswith("unk_")) and f.endswith(".wav") - ) - for old in existing[:max(0, len(existing) - keep + 1)]: - try: os.remove(os.path.join(rec_dir, old)) - except Exception: pass - - # Sanitise transcription for filename: lowercase, alnum + _, <=40 chars - slug = _re.sub(r'[^a-z0-9]+', '_', (transcription or "").lower()).strip('_')[:40] - path = os.path.join( - rec_dir, f"{tag}_{int(time.time())}_{slug}.wav" - ) - with wave.open(path, "wb") as w: - w.setnchannels(1) - w.setsampwidth(2) - w.setframerate(self._sample_rate) - w.writeframes(audio_i16.astype(np.int16).tobytes()) - return path - except Exception as e: - log.warning("failed to save turn wav: %s", e) - return None - - def _save_unk_wav(self, audio_i16: np.ndarray) -> Optional[str]: - """Backward-compat wrapper — save with the `unk` tag.""" - return self._save_turn_wav(audio_i16, transcription="", tag="unk") - - # ─── command normalization (post-gate) ──────────────── - - def _normalize_command(self, text: str) -> str: - """ - Apply fuzzy-match to the closest canonical command phrase. - Call AFTER the gated wake check so the wake word has already - been stripped by the caller if appropriate. Turns near-misses - like "Turn right up" → "turn right" so command_parser.py's - regex fast-path can hit them without an LLM round-trip. - """ - canonical = _closest_command(text, cutoff=self._vocab_cutoff) - if canonical != text: - log.info("fuzzy-match: %r → %r", text, canonical) - return canonical + log.info("VoiceModule initialized (backend=gemini)") # ─── main loop ──────────────────────────────────────── def _voice_loop(self): """ - Dispatch to the right loop based on stt.mode: - "wake_and_command" — require "Sanad" wake word (acoustic), then - record and transcribe a command. - "always_on" — Transcribe every utterance, log all, and - dispatch all to the brain. No wake. 
- "always_on_gated" — Transcribe every utterance and log all, - but ONLY dispatch utterances that contain - "Sanad" (fuzzy). Wake word is stripped - before the command is sent to the brain. + Spawn the Gemini Live STT subprocess (runs in the gemini_sdk + Python 3.10+ env) and forward its transcripts into Marcus's + dispatch gate. Marcus's main process never opens the Gemini + WebSocket itself — google-genai needs Python ≥3.9 and marcus + is pinned to 3.8 by the Jetson torch wheel. """ - mode = self._stt.get("mode", "wake_and_command").lower() - self._mic_capture.start() - if mode in ("always_on", "always_on_gated"): - self._voice_loop_always_on(gated=(mode == "always_on_gated")) - else: - self._voice_loop_wake() + api_key = ( + os.environ.get("MARCUS_GEMINI_API_KEY") + or os.environ.get("SANAD_GEMINI_API_KEY") + or self._stt.get("gemini_api_key", "") + ) + if not api_key: + log.error( + "No Gemini API key found. Set env MARCUS_GEMINI_API_KEY " + "or stt.gemini_api_key in Config/config_Voice.json" + ) + while self._running: + time.sleep(0.5) + return - def _voice_loop_wake(self): - """Classic wake-and-command: listen for 'Sanad', then record command.""" - log.info("Voice loop started — listening for wake (energy-based)") + from Voice.gemini_script import GeminiBrain - was_speaking = False - while self._running: + # Env overrides for model + voice are passed through to the + # runner subprocess automatically (it reads the same env vars). + model = ( + os.environ.get("MARCUS_GEMINI_MODEL") + or self._stt.get( + "gemini_model", + "gemini-2.5-flash-native-audio-preview-12-2025", + ) + ) + voice_name = ( + os.environ.get("MARCUS_GEMINI_VOICE") + or self._stt.get("gemini_voice_name", "Charon") + ) + # System prompt: the runner reads the same config & file paths, + # but we forward the resolved string in case marcus's config layer + # picked a fallback. Forwarded via env in GeminiBrain.start(). 
+ system_prompt = self._stt.get( + "gemini_system_prompt", + "Transcribe what the user says to Sanad. Stay silent.", + ) + sp_file = self._stt.get("gemini_system_prompt_file", "") + if sp_file: + sp_path = sp_file if os.path.isabs(sp_file) else os.path.join( + PROJECT_ROOT, sp_file, + ) try: - if self._audio.is_speaking: - was_speaking = True - time.sleep(0.1) - self._detector.reset() - continue - - if was_speaking: - time.sleep(0.25) - self._mic_capture.flush() - self._detector.reset() - was_speaking = False - - if time.time() < self._cooldown_until: - _ = self._read_mic_raw(1024) - self._detector.reset() - time.sleep(0.05) - continue - - chunk = self._read_mic_raw(1024) - if not chunk: - continue - - if self._detector.process(chunk): - self._handle_wake() + with open(sp_path, "r", encoding="utf-8") as f: + loaded = f.read().strip() + if loaded: + system_prompt = loaded + log.info( + "gemini system prompt loaded from %s (%d chars)", + sp_path, len(loaded), + ) except Exception as e: - log.error("Voice loop error: %s", e, exc_info=True) - time.sleep(1) + log.warning( + "gemini_system_prompt_file=%r unreadable: %s — " + "using inline config", sp_file, e, + ) - def _voice_loop_always_on(self, gated: bool = False): - """ - Always-on mode — Sanad-style continuous listening. - - If `gated` is True, utterances that don't contain the wake word - "Sanad" (or a fuzzy variant) are logged but NOT dispatched to the - brain — the robot hears everything, speaks only when addressed. - - Architecture (no wake word, no ack TTS): - 1. Continuously read the gained mic stream in 32 ms chunks. - 2. Run a hysteretic VAD on the stream — speech_entry_rms - starts an utterance, silence_exit_rms + silence_duration - ends one. - 3. On each utterance end → Whisper transcribe → fuzzy-match - → dispatch to brain. - 4. Every ~5 s of idle: log a `ambient: rms=... peak=...` line - so you can SEE what the mic is doing at all times, even - when nobody's talking. 
Matches Sanad's "always listening" - visibility. - 5. Speech is not gated on amplitude — everything above the - entry threshold is captured, quiet or loud. Loud speech - clips naturally against int16; Whisper handles it. - - Thresholds come from the same stt.* config as wake mode but are - typically tuned lower here (you want eager capture since there's - no wake-word gate to prevent false positives). - """ log.info( - "Voice loop started — ALWAYS-ON mode%s", - " [gated: only 'Sanad' utterances dispatched]" if gated - else " (no wake word — every utterance dispatched)" + "Voice loop started — GEMINI STT subprocess " + "(model=%s, voice=%s)", model, voice_name, ) - speech_entry = float(self._stt.get("always_on_speech_entry_rms", 250.0)) - silence_exit = float(self._stt.get("always_on_silence_exit_rms", 120.0)) - silence_dur = float(self._stt.get("always_on_silence_duration_sec", 0.8)) - min_utter_s = float(self._stt.get("always_on_min_utterance_sec", 0.3)) - max_utter_s = float(self._stt.get("always_on_max_utterance_sec", 12.0)) - idle_log_s = float(self._stt.get("always_on_idle_log_sec", 5.0)) - ambient_mult = float(self._stt.get("always_on_ambient_mult", 1.4)) - ambient_win = int(self._stt.get("always_on_ambient_window_chunks", 100)) + brain = GeminiBrain( + None, None, # audio_io, recorder owned by runner + voice_name=voice_name, + system_prompt=system_prompt, + api_key=api_key, + on_transcript=self._on_gemini_transcript, + on_command=self._dispatch_gemini_command, + ) + self._brain = brain + brain.start() - buffer = bytearray() - in_speech = False - silence_budget = 0.0 - speech_duration = 0.0 - peak_rms = 0.0 - idle_peak_rms = 0.0 - idle_sum_rms = 0.0 - idle_chunks = 0 - last_idle_log = time.time() - was_speaking_tts = False + try: + while self._running: + time.sleep(0.25) + finally: + brain.stop() + self._brain = None - # Rolling ambient (idle-only) RMS buffer. 
Used to adapt silence_exit - # so a noisy room doesn't trap the VAD at max_utter_s: if the - # observed idle floor sits at rms=200, silence_exit needs to be - # above 200 or silence never accumulates. We take - # effective_exit = max(config_silence_exit, ambient_floor * mult). - ambient_buf: list = [] - ambient_floor = 0.0 + # ─── dispatch side channel ──────────────────────────── - # Seed ambient_floor by sampling ~1s of mic BEFORE entering the - # loop. Without this, the very first utterance runs with - # ambient_floor=0 → eff_exit=config_floor, which under-cuts - # noisy rooms and creates self-sustaining echo loops. - seed_chunks = [] - seed_deadline = time.time() + 1.0 - while time.time() < seed_deadline: - r = self._read_mic_gained(1024) - if r: - a = np.frombuffer(r, dtype=np.int16) - if a.size: - seed_chunks.append( - float(np.sqrt(np.mean(a.astype(np.float64) ** 2))) - ) - else: - time.sleep(0.005) - if seed_chunks: - # Use the median so one loud transient doesn't poison the seed. - seed_chunks.sort() - ambient_floor = seed_chunks[len(seed_chunks) // 2] - ambient_buf = list(seed_chunks[-ambient_win:]) - log.info("ambient seeded: floor=%.0f from %d chunks", - ambient_floor, len(seed_chunks)) + def _on_gemini_transcript(self, text: str) -> None: + """Log every user transcript to logs/transcript.log.""" + if text: + _log_transcript("HEARD", text) - while self._running: - try: - # Drop mic input while the robot itself is speaking so we - # don't feed our own TTS back through Whisper. - if self._audio.is_speaking: - was_speaking_tts = True - buffer.clear() - in_speech = False - silence_budget = 0.0 - speech_duration = 0.0 - peak_rms = 0.0 - time.sleep(0.1) - continue + def _dispatch_gemini_command(self, text: str, lang: str = "en") -> None: + """ + Fire self._on_command for any transcript prefixed with the wake + word "Sanad". 
Marcus's brain is the authoritative decision maker + in the STT-only architecture — it handles motion AND Q&A AND + vision queries AND replies via TtsMaker. - if was_speaking_tts: - time.sleep(float(self._stt.get("post_tts_settle_sec", 0.3))) - self._mic_capture.flush() - was_speaking_tts = False + The vocab-match gate has been dropped: if the user says + "Sanad, what's the weather" the transcript still reaches the + brain, which either answers via its VLM or declines. This keeps + all Gemini-heard queries routed through one place (Marcus) and + removes the audio collision that full S2S had. - raw = self._read_mic_gained(1024) - if not raw: - time.sleep(0.005) - continue + Examples: + "Sanad, turn right" → strip → "turn right" → brain → motion + "Sanad, what do you see"→ strip → "what do you see" → brain → VLM + "Sanad" → bare wake → skip (no payload) + "turn right" → no wake word → skip (conversation gate) - arr = np.frombuffer(raw, dtype=np.int16) - rms = float(np.sqrt(np.mean(arr.astype(np.float64) ** 2))) - chunk_s = (len(raw) // 2) / self._sample_rate - - if in_speech: - buffer.extend(raw) - speech_duration += chunk_s - peak_rms = max(peak_rms, rms) - - # Adaptive silence exit: sits max(config_floor, - # ambient_floor × mult). Prevents the "room is noisier - # than silence_exit" failure mode where silence never - # accumulates and every utterance hits max_utter_s. 
- eff_exit = max(silence_exit, ambient_floor * ambient_mult) - if rms < eff_exit: - silence_budget += chunk_s - else: - silence_budget = 0.0 - - utter_over = (silence_budget >= silence_dur and - speech_duration >= min_utter_s) - force_stop = speech_duration >= max_utter_s - - if utter_over or force_stop: - reason = "max-duration" if force_stop else "silence" - audio = np.frombuffer(bytes(buffer), dtype=np.int16) - log.info("utterance end (%s): dur=%.2fs peak_rms=%.0f samples=%d", - reason, speech_duration, peak_rms, audio.size) - - # RESET STATE IMMEDIATELY — before any Whisper / - # speak() / dispatch. Previously a `continue` from - # the wake-only ack branch skipped the reset, and - # the 12-second buffer lived forever, re-transcribed - # every iteration into the same "Sanad" output, - # spawning a self-sustaining "Yes" loop. - buffer.clear() - in_speech = False - silence_budget = 0.0 - speech_duration = 0.0 - peak_rms = 0.0 - - text = self._transcribe_command(audio) if audio.size else "" - if text: - log.info("HEARD: %r", text) - _log_transcript("HEARD", text) - - # ── Two-turn gated flow ──────────────────── - # State A — listening for wake: - # non-wake utterance → log only, do not dispatch - # "Sanad " → strip + dispatch now - # "Sanad" alone → speak "Yes", switch to - # state B ("awaiting command") - # State B — awaiting command (after Yes): - # any utterance → dispatch as the command, - # regardless of wake word. - # Then back to state A. - # - # This matches the SanadVoice/gemini_interact - # pattern: always transcribe + log every word, - # say "yes" on wake, treat the next utterance - # as the command. - # Timeout stale await-command state - if self._awaiting_command and time.time() > self._await_deadline: - log.info(" [awaiting-cmd] timed out — back to wake-listen") - self._awaiting_command = False - - if self._awaiting_command: - # State B — next utterance is the command. 
- command = _strip_wake_word(text) # drop accidental "Sanad," - if not command: - command = text # safety: never drop to empty - command = self._normalize_command(command) - log.info(" [awaiting-cmd] dispatching: %r", command) - _log_transcript("CMD", command) - self._awaiting_command = False - print(f' [Sanad] heard: "{command}"') - if self._on_command: - try: - self._on_command(command, "en") - except Exception as e: - log.error("on_command: %s", e, exc_info=True) - continue - - # State A — listening for wake. - if gated and not _has_wake_word(text): - log.info(" (no wake word — logged only)") - _log_transcript("IGN", text) - continue - - if gated: - command = _strip_wake_word(text) - if command != text: - log.info(" wake-stripped: %r → %r", - text, command) - if not command: - # Bare "Sanad" — speak "Yes" and arm - # the next-utterance-as-command trigger. - log.info(" wake heard alone — speaking 'Yes', " - "next utterance will be treated as command") - _log_transcript("WAKE", text) - try: - self._audio.speak( - self._messages.get("wake_heard", "Yes") - ) - except Exception as e: - log.warning("wake-ack TTS failed: %s", e) - self._awaiting_command = True - self._await_deadline = time.time() + float( - self._stt.get("await_command_timeout_sec", 10.0) - ) - continue - else: - command = text - - # Sanad + command in one utterance (e.g. - # "Sanad, turn left") → fuzzy-normalize + dispatch. - command = self._normalize_command(command) - _log_transcript("CMD", command) - print(f' [Sanad] heard: "{command}"') - if self._on_command: - try: - self._on_command(command, "en") - except Exception as e: - log.error("on_command: %s", e, exc_info=True) - else: - log.info("utterance rejected (empty/garbage after Whisper)") - _log_transcript("UNK", "(empty)") - else: - idle_peak_rms = max(idle_peak_rms, rms) - idle_sum_rms += rms - idle_chunks += 1 - - # Maintain the rolling ambient floor so silence_exit can - # adapt. 
Use windows that are *clearly* not speech - # (rms < speech_entry / 2) — otherwise a borderline - # window just before transition pollutes the floor. - if rms < speech_entry * 0.5: - ambient_buf.append(rms) - if len(ambient_buf) > ambient_win: - ambient_buf.pop(0) - if ambient_buf: - ambient_floor = sum(ambient_buf) / len(ambient_buf) - - if rms >= speech_entry: - # utterance starts — keep this chunk as pre-roll - log.info("utterance start (rms=%.0f >= entry=%.0f)", - rms, speech_entry) - buffer.extend(raw) - in_speech = True - speech_duration = chunk_s - peak_rms = rms - silence_budget = 0.0 - - # periodic ambient log while idle — "I am listening" - now = time.time() - if (now - last_idle_log) >= idle_log_s and idle_chunks > 0: - eff_exit = max(silence_exit, ambient_floor * ambient_mult) - log.info("ambient: mean_rms=%.0f peak_rms=%.0f chunks=%d " - "floor=%.0f entry=%.0f eff_exit=%.0f", - idle_sum_rms / idle_chunks, idle_peak_rms, - idle_chunks, ambient_floor, speech_entry, eff_exit) - idle_peak_rms = 0.0 - idle_sum_rms = 0.0 - idle_chunks = 0 - last_idle_log = now - except Exception as e: - log.error("Always-on voice loop error: %s", e, exc_info=True) - time.sleep(1) - - def _handle_wake(self): - t_wake = time.time() - log.info("Wake detected (acoustic)") - - # Verify the burst that triggered wake actually sounds like a - # wake word. The acoustic detector fires on ANY 0.2-1.5s burst - # (coughs, claps, door slams). We run a lightweight Whisper - # decode on the burst and accept if EITHER: - # (a) a wake-word variant is in the transcription, OR - # (b) the transcription starts with 's'/'sh'/'z' — Whisper's - # consistent signature for mishearing non-English "Sanad" - # as an English /sa-/ word ("Stop", "Set", "Sand", "Send"). - # Reject if Whisper returns empty (pure noise / cough) or - # confidently not-s speech ("hello", "okay"). 
- if self._stt.get("wake_verify_enabled", True): - burst = self._detector.get_last_burst() - if burst is not None and burst.size >= int(0.15 * self._sample_rate): - t_verify = time.time() - # Lenient transcribe — no garbage filter, no min-length, - # no bias prompt. See _transcribe_raw docstring. - heard = self._transcribe_raw(burst) - verify_ms = (time.time() - t_verify) * 1000 - low = (heard or "").lower().strip().lstrip('"\'.,!?') - if not low: - log.info(" wake REJECTED — whisper empty (%.0fms)", verify_ms) - return - starts_with_s = low.startswith(("s", "sh", "z")) - if _has_wake_word(heard): - log.info(" wake verified (wake-word: %r, %.0fms)", - heard, verify_ms) - elif starts_with_s: - log.info(" wake verified (s-phonetic: %r, %.0fms)", - heard, verify_ms) - else: - log.info(" wake REJECTED — %r (%.0fms, not s-starting)", - heard, verify_ms) - return - - print("\n [Sanad] wake heard — listening…") - - ack_mode = self._stt.get("wake_ack", "tts").lower() - if ack_mode == "none": - log.info(" wake-ack: silent (no TTS)") - else: - try: - self._audio.speak(self._messages.get("wake_heard", "Yes")) - except Exception as e: - log.warning("TTS ack failed: %s", e) - - # Wait for ack TTS + speaker reverberation to decay - while self._audio.is_speaking: - time.sleep(0.05) - settle = float(self._stt.get("post_tts_settle_sec", 0.3)) - time.sleep(settle) - self._mic_capture.flush() - log.info(" wake→record-ready: %.2fs", time.time() - t_wake) - - log.info("Recording command...") - audio = self._record_command() - # _record_command returns empty if it never saw speech above the - # adaptive entry threshold — no point running STT on noise. - # Two cases: - # audio.size == 0 → no speech at all (likely false wake - # from cough/slam). SILENTLY reset — - # don't blurt "I didn't catch that" on - # what was never a real interaction. - # 0 < size < 8000 → brief speech burst (< 0.5s). 
Probably - # a real-but-unintelligible attempt; - # speak "I didn't catch that" so the - # user knows to retry. - if audio.size == 0: - log.info("Command dropped (no speech — likely false wake); silent reset") - self._cooldown_until = time.time() + float( - self._stt.get("command_cooldown_sec", 1.5)) - return - if audio.size < 8000: # < 0.5 s but > 0 — real short attempt - log.info("Command too short (%.2fs); asking user to repeat", - audio.size / self._sample_rate) - try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that")) - except Exception: pass - self._cooldown_until = time.time() + float( - self._stt.get("command_cooldown_sec", 1.5)) + Dedup: Gemini emits streaming partials; same normalized command + within command_cooldown_sec fires only once. + """ + if not text or not _has_wake_word(text): return - peak = int(np.abs(audio).max()) - rms = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) - log.info("command audio: samples=%d peak=%d rms=%.1f", - audio.size, peak, rms) - - text = self._transcribe_command(audio) - if not text: - log.info("Empty or rejected transcription") - # Save WAV of the failed transcription for post-mortem. - if self._stt.get("recording_enabled", True): - self._save_turn_wav(audio, transcription="", tag="unk") - try: self._audio.speak(self._messages.get("no_speech", "I didn't catch that")) - except Exception: pass - self._cooldown_until = time.time() + float( - self._stt.get("command_cooldown_sec", 1.5)) + stripped = _strip_wake_word(text) + if not stripped or len(stripped.strip()) < _MIN_TRANSCRIPTION_LENGTH: return - # Normalize near-miss transcriptions like "Turn right up" → "turn - # right" so the brain's regex fast-path catches them. 
- text = self._normalize_command(text) - log.info("Transcribed: %s", text[:120]) + low = stripped.lower().strip().rstrip(".!?,") + vocab_exact = {c.lower() for c in COMMAND_VOCAB} + if low in GARBAGE_PATTERNS and low not in vocab_exact: + return - # Save every successful command recording so you can listen back - # later and see what the mic actually heard vs what Whisper - # transcribed. Disable with stt.recording_enabled=false. - if self._stt.get("recording_enabled", True): - wav_path = self._save_turn_wav(audio, transcription=text, tag="cmd") - if wav_path: - log.info("saved: %s", os.path.basename(wav_path)) + # Fuzzy-normalize (maps "turn right up" → "turn right") if the + # transcript is close to a vocab entry — but unlike before, we + # forward everything that passed the wake-word gate, not just + # vocab hits. Marcus's command_parser + VLM handles the rest. + command = self._normalize_command(stripped) + canon = command.lower().strip().rstrip(".!?,") + now = time.time() + cooldown = float(self._stt.get("command_cooldown_sec", 1.5)) + if (canon == self._last_gemini_canon + and now - self._last_gemini_dispatch_at < cooldown): + return + self._last_gemini_canon = canon + self._last_gemini_dispatch_at = now + + log.info("dispatch (gemini): %s", command[:120]) + _log_transcript("CMD", command) if self._on_command: try: - self._on_command(text, "en") + self._on_command(command, lang or "en") except Exception as e: log.error("on_command error: %s", e, exc_info=True) - elif self._on_wake: - try: self._on_wake() - except Exception: pass - cd = float(self._stt.get("command_cooldown_sec", 1.5)) - self._cooldown_until = time.time() + cd - log.info("wake→dispatch total: %.2fs | cooldown %.1fs", - time.time() - t_wake, cd) + def flush_mic(self) -> None: + """ + Tell the Gemini runner subprocess to drop its mic buffer. 
Called + before AND after `audio_api.speak()` so the robot's own voice + (picked up by the mic during TtsMaker playback) doesn't come back + from Gemini as a fake user utterance. + No-op if the runner hasn't started yet. + """ + b = getattr(self, "_brain", None) + if b is None: + return + try: + b.flush_mic() + except Exception: + pass + + def _normalize_command(self, text: str) -> str: + """Fuzzy-match a transcription to the closest canonical phrase.""" + canonical = _closest_command(text, cutoff=self._vocab_cutoff) + if canonical != text: + log.info("fuzzy-match: %r → %r", text, canonical) + return canonical # ─── start / stop ───────────────────────────────────── @@ -1313,33 +393,23 @@ class VoiceModule: log.warning("VoiceModule already running") return self._running = True - self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice") + self._thread = threading.Thread( + target=self._voice_loop, daemon=True, name="voice", + ) self._thread.start() log.info("Voice module started") def stop(self): self._running = False - try: self._mic_capture.stop() - except Exception: pass if self._thread: self._thread.join(timeout=5) self._thread = None log.info("Voice module stopped") @property - def is_running(self) -> bool: - return self._running - - -if __name__ == "__main__": - from API.audio_api import AudioAPI - def on_cmd(text, lang): - print(f"\n COMMAND [{lang}]: {text}\n") - audio = AudioAPI() - voice = VoiceModule(audio, on_command=on_cmd) - print('Starting. 
Say "Sanad", then speak your command.\n') - voice.start() - try: - while voice.is_running: time.sleep(0.5) - except KeyboardInterrupt: - voice.stop() + def is_speaking(self) -> bool: + """Delegates to AudioAPI — True while TtsMaker is playing.""" + try: + return bool(self._audio.is_speaking) + except Exception: + return False diff --git a/Voice/turn_recorder.py b/Voice/turn_recorder.py new file mode 100644 index 0000000..2c2fc46 --- /dev/null +++ b/Voice/turn_recorder.py @@ -0,0 +1,158 @@ +"""Per-turn WAV recorder for voice brains. + +Direct port of Project/Sanad/voice/sanad_voice.py::TurnRecorder. Saves each +conversation turn as two WAV files: + + _user.wav mono int16 @ 16 kHz (what the mic captured) + _robot.wav mono int16 @ 24 kHz (what the brain spoke) + +Plus an index.json that appends one entry per turn with the transcripts. + +A turn starts when audio first flows through `capture_user` or +`capture_robot`, and ends on `finish_turn`. Call pattern matches Sanad +exactly: `capture_user`, `capture_robot`, `add_user_text`, `add_robot_text`, +`finish_turn`. + +Disable via config: stt.gemini_record_enabled = false (the caller passes +`enabled=False`). 
+""" + +from __future__ import annotations + +import json +import logging +import os +import threading +import time +import wave +from datetime import datetime + +log = logging.getLogger("turn_recorder") + + +class TurnRecorder: + """Saves each turn as two WAV files: user mic + model output.""" + + def __init__( + self, + enabled: bool = True, + out_dir: str = "", + user_rate: int = 16000, + robot_rate: int = 24000, + ): + self.enabled = bool(enabled) and bool(out_dir) + self.out_dir = out_dir + self.user_rate = int(user_rate) + self.robot_rate = int(robot_rate) + if self.enabled: + os.makedirs(self.out_dir, exist_ok=True) + self._lock = threading.Lock() + self._user_buf = [] + self._robot_buf = [] + self._user_text = "" + self._robot_text = "" + self._started_at = 0.0 + + def capture_user(self, pcm: bytes) -> None: + if not self.enabled or not pcm: + return + with self._lock: + if not self._user_buf and not self._robot_buf: + self._started_at = time.time() + self._user_buf.append(pcm) + + def capture_robot(self, pcm: bytes) -> None: + if not self.enabled or not pcm: + return + with self._lock: + if not self._user_buf and not self._robot_buf: + self._started_at = time.time() + self._robot_buf.append(pcm) + + def add_user_text(self, text: str) -> None: + if text and self.enabled: + with self._lock: + self._user_text = (self._user_text + " " + text).strip() + + def add_robot_text(self, text: str) -> None: + if text and self.enabled: + with self._lock: + self._robot_text = (self._robot_text + " " + text).strip() + + def finish_turn(self) -> dict: + if not self.enabled: + return {} + with self._lock: + user_data = b"".join(self._user_buf) + robot_data = b"".join(self._robot_buf) + user_text = self._user_text + robot_text = self._robot_text + started_at = self._started_at + self._user_buf.clear() + self._robot_buf.clear() + self._user_text = "" + self._robot_text = "" + + if not user_data and not robot_data: + return {} + + stamp = 
datetime.fromtimestamp(started_at).strftime("%Y%m%d_%H%M%S") + entry = { + "timestamp": stamp, + "started_at": started_at, + "user_text": user_text, + "robot_text": robot_text, + } + try: + if user_data: + p = os.path.join(self.out_dir, "{}_user.wav".format(stamp)) + self._save_wav(p, user_data, self.user_rate) + entry["user_wav"] = p + entry["user_duration_sec"] = round( + len(user_data) / (self.user_rate * 2), 3, + ) + if robot_data: + p = os.path.join(self.out_dir, "{}_robot.wav".format(stamp)) + self._save_wav(p, robot_data, self.robot_rate) + entry["robot_wav"] = p + entry["robot_duration_sec"] = round( + len(robot_data) / (self.robot_rate * 2), 3, + ) + self._append_index(entry) + log.info( + "recorded turn → %s (user %.1fs, robot %.1fs)", + stamp, + entry.get("user_duration_sec", 0), + entry.get("robot_duration_sec", 0), + ) + except Exception as exc: + log.warning("recording save failed: %s", exc) + return entry + + @staticmethod + def _save_wav(path: str, pcm: bytes, rate: int) -> None: + with wave.open(path, "wb") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(rate) + wf.writeframes(pcm) + + def _append_index(self, entry: dict) -> None: + idx_path = os.path.join(self.out_dir, "index.json") + try: + if os.path.exists(idx_path): + with open(idx_path, "r", encoding="utf-8") as f: + payload = json.load(f) + if not isinstance(payload, dict): + payload = {"records": []} + else: + payload = {"records": []} + except Exception: + payload = {"records": []} + payload.setdefault("records", []).append(entry) + payload["total_records"] = len(payload["records"]) + try: + with open(idx_path, "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2, ensure_ascii=False) + except Exception as exc: + log.warning("index.json write failed: %s", exc) diff --git a/Voice/wake_detector.py b/Voice/wake_detector.py deleted file mode 100644 index 640ad11..0000000 --- a/Voice/wake_detector.py +++ /dev/null @@ -1,263 +0,0 @@ -#!/usr/bin/env python3 -""" 
-Voice/wake_detector.py — custom wake-word detector (no ML, no Vosk, no Whisper). - -Energy-envelope state machine. Monitors raw PCM audio and fires a wake -event when it sees a short speech burst (sized to match a single spoken -word like "Sanad") followed by a clear silence. - -Why this exists: - Vosk's small English lexicon doesn't contain the word "sanad" and - substitutes arbitrary English words ("us", "of", "senate"). Whisper on - this Jetson's torch-aarch64 produces "!!!!!" garbage. Both are broken - for this specific hardware + wake word. An acoustic detector using - only numpy doesn't care what the word actually is — it detects the - *shape* of a single spoken word in the audio energy envelope. - -Algorithm (state machine): - SILENCE ──(rms > speech_threshold)──> SPEAKING - SPEAKING ──(rms < silence_threshold for N chunks)──> ANALYZE - ANALYZE: if 0.2 s < speech_duration < 1.5 s → fire WAKE - else → reset to SILENCE (too short = cough, too long = sentence) - after fire → COOLDOWN for 1.5 s before next detection - -What it does NOT do: - - Does not identify which word was spoken (anything in the - duration range triggers) - - Does not transcribe follow-on commands (you type those at the - terminal) - - Does not protect against loud non-speech (clapping, door slam) - -Usage: - from Voice.wake_detector import WakeDetector - det = WakeDetector(sample_rate=16000) - while True: - chunk = mic.read_chunk(1024) # bytes of int16 PCM - if det.process(chunk): - print("Wake!") -""" - -from __future__ import annotations - -import time -from dataclasses import dataclass -from typing import Optional - -import numpy as np - - -@dataclass -class WakeConfig: - sample_rate: int = 16_000 - # RMS (int16 units) FLOOR for "this chunk is speech". The effective - # threshold is max(speech_threshold, ambient_baseline * adaptive_mult) - # so this is only a minimum guarantee — the detector adapts upward - # in noisy rooms but never below this floor. 
- # G1 far-field mic at normal speaking distance has rms ~ 80-400 for - # quiet speech, 400-1500 for clear speech. 80 catches quiet speech; - # raise to 120-150 if fan/typing noise triggers false wakes. - speech_threshold: float = 80.0 - # How long a burst of speech must last to count as a "word". - min_word_duration_s: float = 0.20 - max_word_duration_s: float = 1.50 - # How long of continuous silence we need to consider the word ended. - post_silence_s: float = 0.30 - # Minimum gap between two consecutive wake fires. Prevents a single - # spoken word from triggering twice. - cooldown_s: float = 1.50 - # RMS window size — we analyze this many ms of audio per step. - chunk_ms: int = 50 - # Adaptive: how many *recent silent* chunks to average for the noise - # floor, and the multiplier applied on top. effective_threshold = - # max(speech_threshold, baseline * adaptive_mult). - adaptive_window_n: int = 50 # ~2.5 s at 50 ms chunks - adaptive_mult: float = 3.0 - # Periodic diagnostic log cadence (seconds). 0 disables. 
- diag_log_sec: float = 3.0 - - -class WakeDetector: - """Streaming acoustic wake detector — no language model required.""" - - STATE_SILENCE = "SILENCE" - STATE_SPEAKING = "SPEAKING" - - def __init__(self, cfg: Optional[WakeConfig] = None): - self.cfg = cfg or WakeConfig() - self._chunk_samples = int(self.cfg.sample_rate * self.cfg.chunk_ms / 1000) - self._min_speech = int(self.cfg.min_word_duration_s * self.cfg.sample_rate) - self._max_speech = int(self.cfg.max_word_duration_s * self.cfg.sample_rate) - self._post_silence = int(self.cfg.post_silence_s * self.cfg.sample_rate) - - self._state = self.STATE_SILENCE - self._speech_start = 0 # sample index where current burst began - self._silence_run = 0 # consecutive silent samples inside SPEAKING - self._sample_cursor = 0 # running sample count since start - self._cooldown_until = 0.0 # wall-clock time after which we can fire again - - # A small rolling buffer of leftover samples (when the caller's - # chunks don't align with our internal analysis window). - self._carry = np.zeros(0, dtype=np.int16) - - # Audio of the most-recent wake-triggering burst. Saved when the - # detector fires so callers (marcus_voice) can run Whisper on it - # and verify the word was actually "Sanad" rather than a cough. - self._burst_samples: list = [] # accumulated during SPEAKING - self._last_burst_audio: Optional[np.ndarray] = None - - # Adaptive noise floor (rolling mean of RMS during SILENCE chunks). - self._baseline_buf = [] # last N silent-window RMS values - self._baseline = 0.0 # current estimate - self._peak_since_diag = 0.0 # max rms since last diag log - self._last_diag = time.time() - # Logger is optional — if the host app set up logging, use it. - try: - import logging - self._log = logging.getLogger("wake_detector") - except Exception: - self._log = None - - # ── public API ──────────────────────────────────────────────── - - def process(self, pcm_bytes: bytes) -> bool: - """ - Feed int16 PCM bytes. 
Returns True once per spoken "word" - (short speech burst followed by silence). - """ - if not pcm_bytes: - return False - incoming = np.frombuffer(pcm_bytes, dtype=np.int16) - samples = np.concatenate([self._carry, incoming]) if self._carry.size else incoming - - fired = False - n = self._chunk_samples - i = 0 - while i + n <= samples.size: - window = samples[i:i + n] - if self._step(window): - fired = True - # break — flush the rest on next call so we get one fire per word - i += n - break - i += n - self._sample_cursor += n - - # Keep whatever didn't fit in a full window for next call. - self._carry = samples[i:].copy() - return fired - - def reset(self) -> None: - """Drop all state — call when resuming from a long pause.""" - self._state = self.STATE_SILENCE - self._silence_run = 0 - self._carry = np.zeros(0, dtype=np.int16) - self._burst_samples = [] - - def get_last_burst(self) -> Optional[np.ndarray]: - """ - Return the int16 PCM samples of the most-recent wake-triggering - burst, or None if no wake has fired yet. Used by marcus_voice to - verify the triggering word was actually 'Sanad' before proceeding. - """ - return self._last_burst_audio - - # ── internal ────────────────────────────────────────────────── - - def _step(self, window: np.ndarray) -> bool: - rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2))) - - # Effective threshold = max(config floor, adaptive baseline * mult) - eff = self.cfg.speech_threshold - if self._baseline > 0: - eff = max(eff, self._baseline * self.cfg.adaptive_mult) - is_speech = rms > eff - - # Track peak for diag. Log periodically so you can *see* what the - # detector is hearing — invaluable when "not hearing me" happens. 
- if rms > self._peak_since_diag: - self._peak_since_diag = rms - now = time.time() - if self.cfg.diag_log_sec > 0 and (now - self._last_diag) >= self.cfg.diag_log_sec: - if self._log is not None: - self._log.info( - "wake: peak=%.0f baseline=%.0f eff_threshold=%.0f state=%s", - self._peak_since_diag, self._baseline, eff, self._state, - ) - self._peak_since_diag = 0.0 - self._last_diag = now - - if now < self._cooldown_until: - return False # silent during cooldown - - if self._state == self.STATE_SILENCE: - # Learn the noise floor ONLY in silence — so speech bursts - # don't pull the baseline up and lock us out of wake. - if not is_speech: - self._baseline_buf.append(rms) - if len(self._baseline_buf) > self.cfg.adaptive_window_n: - self._baseline_buf.pop(0) - if self._baseline_buf: - self._baseline = sum(self._baseline_buf) / len(self._baseline_buf) - if is_speech: - self._state = self.STATE_SPEAKING - self._speech_start = self._sample_cursor - self._silence_run = 0 - # Begin capturing the burst audio for later Whisper verify. - self._burst_samples = [window.copy()] - return False - - # STATE_SPEAKING - # Accumulate every window (speech OR silence inside the burst) - # so we capture the full word + trailing quiet. - self._burst_samples.append(window.copy()) - - if is_speech: - self._silence_run = 0 - # Abort if the burst is longer than a single word — user is - # just talking, not addressing the robot. - if self._sample_cursor - self._speech_start > self._max_speech: - self._state = self.STATE_SILENCE - self._burst_samples = [] - return False - - # Silent window inside SPEAKING — accumulate. - self._silence_run += window.size - if self._silence_run >= self._post_silence: - speech_len = (self._sample_cursor - self._silence_run) - self._speech_start - self._state = self.STATE_SILENCE - self._silence_run = 0 - if self._min_speech <= speech_len <= self._max_speech: - # Snapshot burst audio for the caller's Whisper verify. 
- self._last_burst_audio = ( - np.concatenate(self._burst_samples) - if self._burst_samples else None - ) - self._burst_samples = [] - self._cooldown_until = now + self.cfg.cooldown_s - return True - return False - - -# ── standalone test ───────────────────────────────────────────── - -if __name__ == "__main__": - import os - import sys - _HERE = os.path.dirname(os.path.abspath(__file__)) - sys.path.insert(0, os.path.dirname(_HERE)) - from Voice.builtin_mic import BuiltinMic - - print("WakeDetector standalone test — say 'Sanad' a few times.") - print("(Ctrl-C to quit)\n") - det = WakeDetector() - mic = BuiltinMic() - mic.start() - try: - while True: - chunk = mic.read_chunk(1024) - if det.process(chunk): - print(f" [WAKE] (t={time.strftime('%H:%M:%S')})") - except KeyboardInterrupt: - pass - finally: - mic.stop()