diff --git a/Brain/marcus_brain.py b/Brain/marcus_brain.py index 8f63b65..60fa09b 100644 --- a/Brain/marcus_brain.py +++ b/Brain/marcus_brain.py @@ -198,13 +198,32 @@ def _init_voice(): _audio_api = AudioAPI() + # Heuristic filter for unusable Gemini transcripts. Gemini emits + # `<noise>` literally when audio is non-speech and `.` for empty + # bursts. These shouldn't pollute the terminal or trigger motion. + def _is_garbage_transcript(t: str) -> bool: + stripped = t.strip().strip(".!?,").strip() + if not stripped: + return True + low = stripped.lower() + if low in ("<noise>", "noise", "yeah", "ok", "okay", "uh", "um", "hmm", "mm"): + return True + # Bare single character (often "." → ".") or all punctuation. + if all(not c.isalnum() for c in stripped): + return True + # Short non-ASCII fragments (e.g. Korean / Thai / Arabic + # snippets that come from echo or distortion). + if len(stripped) <= 6 and not all(ord(c) < 128 for c in stripped): + return True + return False + def _on_command(text, lang): text = (text or "").strip() if not text: return - # One clean, distinctive line so the operator can see exactly - # what Whisper transcribed before the brain reacts. Everything - # else from the voice subsystem is file-only. + if _is_garbage_transcript(text): + # Skip silently — neither show nor dispatch to brain. + return print(f' [Sanad] heard: "{text}"') try: result = process_command(text) @@ -213,19 +232,16 @@ def _init_voice(): return if isinstance(result, dict): sp = (result.get("speak") or "").strip() - if sp and _audio_api: - # Drop Gemini's mic buffer so the robot's own voice - # (picked up by the mic during TtsMaker playback) - # doesn't get transcribed and fed back as a new - # "user" utterance. 
- if _voice_module is not None: - try: _voice_module.flush_mic() - except Exception: pass - _audio_api.speak(sp) - if _voice_module is not None: - try: _voice_module.flush_mic() - except Exception: pass - # Redraw the Command: prompt that our print clobbered + act = (result.get("action") or "").strip() + # In the Gemini S2S architecture Gemini owns the voice; we + # do NOT call audio_api.speak(sp) here (would collide with + # Gemini's own audio reply). Just show the operator what + # the brain decided so they can correlate motion with + # Gemini's spoken acknowledgement. + if act and act not in ("NONE", "TALK"): + print(f" [Sanad] doing: {act}{f' — {sp[:80]}' if sp else ''}") + elif sp: + print(f" [Brain] {sp[:120]}") print("Command: ", end="", flush=True) _voice_module = VoiceModule(_audio_api, on_command=_on_command) diff --git a/Config/config_Voice.json b/Config/config_Voice.json index de176ba..3e3c4f6 100644 --- a/Config/config_Voice.json +++ b/Config/config_Voice.json @@ -7,9 +7,9 @@ }, "stt": { - "_comment": "Voice pipeline: Gemini Live STT (text-mode) → Marcus brain → TtsMaker. Gemini transcribes the user's speech with server-side VAD; Marcus's brain (Brain/marcus_brain.py) decides the reply and speaks it via AudioAPI.speak → TtsMaker. No audio comes back from Gemini (response_modalities=['TEXT']). Install on Jetson: `pip install google-genai`. API key: env MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY fallback).", + "_comment": "Voice pipeline: Gemini Live SPEECH-TO-SPEECH (Sanad pattern). Gemini hears the mic, sees camera frames streamed over from Marcus, and replies with its own voice through the G1 speaker. Marcus's brain still dispatches motion commands via a side channel — when the transcript matches 'Sanad + action', Marcus's command_parser fires the motion silently while Gemini speaks the verbal acknowledgement. The brain's `speak` reply is logged but NOT spoken (avoids double-audio collision with Gemini). 
Install on Jetson (gemini_sdk env): `pip install google-genai`. API key: env MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY fallback).", - "_gemini_comment": "Gemini Live STT-only settings. The actual Gemini WebSocket runs in a SEPARATE Python 3.10+ subprocess (Voice/gemini_runner.py) because google-genai requires Python ≥3.9 and marcus is pinned to Python 3.8 by the NVIDIA Jetson torch wheel. The marcus parent process spawns `gemini_python_path -u Voice/gemini_runner.py` and parses the JSON-line transcripts on stdout. Env overrides: MARCUS_GEMINI_API_KEY / MARCUS_GEMINI_MODEL / MARCUS_GEMINI_PYTHON.", + "_gemini_comment": "Gemini Live S2S settings. The actual Gemini WebSocket runs in a SEPARATE Python 3.10+ subprocess (Voice/gemini_runner.py) because google-genai requires Python ≥3.9 and marcus is pinned to Python 3.8 by the NVIDIA Jetson torch wheel. The runner ALSO owns the G1 speaker (unitree_sdk2py works in gemini_sdk env) so Gemini's audio plays directly without IPC. The marcus parent process forwards camera frames to the runner via stdin so Gemini can see what the robot sees. Env overrides: MARCUS_GEMINI_API_KEY / MARCUS_GEMINI_MODEL / MARCUS_GEMINI_VOICE / MARCUS_GEMINI_PYTHON.", "_gemini_python_path_comment": "Path to a Python 3.10+ binary that has `google-genai` installed (typically a separate conda env, e.g. `gemini_sdk` on this Jetson). Leave empty to auto-detect — the manager tries ~/miniconda3/envs/gemini_sdk/bin/python and a few common alternates. Override at runtime via env MARCUS_GEMINI_PYTHON.", "gemini_python_path": "", "gemini_api_key": "AIzaSyDt9Xi83MDZuuPpfwfHyMD92X7ZKdGkqf8", @@ -18,11 +18,26 @@ "gemini_audio_profile": "builtin", "gemini_chunk_size": 512, "gemini_send_sample_rate": 16000, + "gemini_receive_sample_rate": 24000, "gemini_record_enabled": true, - "_gemini_system_prompt_comment": "Marcus brain is the authoritative reply path; Gemini is just an ear here. Keep the prompt short — it tells Gemini to transcribe, not to chat. 
Override by pointing gemini_system_prompt_file at a text file (relative paths resolve from PROJECT_ROOT).", + "_gemini_camera_comment": "Stream camera frames to Gemini Live so vision answers ('what do you see') are correct rather than hallucinated. Marcus parent grabs JPEG frames via API.camera_api.get_frame() at gemini_frame_interval_sec cadence and pipes them to the runner over stdin. gemini_frame_max_age_sec is the age limit past which stale frames are dropped. Set gemini_send_frames=false to disable (saves API tokens but breaks vision questions).", + "gemini_send_frames": true, + "gemini_frame_interval_sec": 0.5, + "gemini_frame_max_age_sec": 1.5, + + "_gemini_barge_comment": "Barge-in = user speaking over Gemini. gemini_barge_loud_chunks_needed loud chunks (currently 5) above barge_threshold interrupt Gemini mid-sentence. echo_suppress_below masks mic frames quieter than the threshold during playback so the mic doesn't re-feed Gemini its own voice. On the G1 the on-board speaker is loud enough that ECHO frames hit ~1500-3000 RMS, well above the earlier 500 barge threshold — that's why earlier sessions saw self-interrupt loops. Tuned values: threshold 3500 (only a real shout cuts Gemini off), echo_suppress_below 3500 (mute everything below that during AI playback — anything quieter than the speaker's own echo is treated as silence). ai_speak_grace_sec 0.5 gives Gemini a half-second runway before barge can fire. If you find users genuinely can't interrupt Gemini, drop barge_threshold to ~2500 and accept some self-interrupts.", + "gemini_barge_threshold": 3500, + "gemini_barge_loud_chunks_needed": 5, + "gemini_barge_cooldown_sec": 0.5, + "gemini_echo_suppress_below": 3500, + "gemini_ai_speak_grace_sec": 0.5, + "gemini_begin_stream_pause_sec": 0.15, + "gemini_wait_finish_margin_sec": 0.3, + + "_gemini_system_prompt_comment": "Persona for Gemini Live's spoken reply. Gemini owns the voice in this architecture, so make this prompt match the experience you want users to hear. 
The robot's body is controlled by Marcus's brain via a side channel — Gemini doesn't need to invoke motions itself, just acknowledge them naturally. Override by pointing gemini_system_prompt_file at a text file (relative paths resolve from PROJECT_ROOT).", "gemini_system_prompt_file": "", - "gemini_system_prompt": "You are Sanad's ear. Your only job is to transcribe what the user says to Sanad, the humanoid robot. Do not respond conversationally. Do not speculate. Do not invent dialogue. If the user addresses Sanad, return exactly what they said. Stay completely silent in your response.", + "gemini_system_prompt": "You are Sanad (سند), a friendly humanoid robot assistant made by YS Lootah Technology in Dubai. Your body is a Unitree G1 humanoid. You can see the user through your camera and talk to them in real time. You speak both English and Arabic naturally — match the user's language in your reply. Reply briefly, usually one or two sentences. When the user asks 'what do you see' / 'ماذا ترى' or describes the scene, look at the camera frames you're receiving and answer accurately based on what's actually there; do not invent details. CRITICAL ACTION RULE — physical motion only happens when the user addresses you by name 'Sanad' (English) or 'سند' (Arabic) AND gives an action. Examples: 'Sanad, turn right' → say 'Turning right.' 'سند، استدر يميناً' → say 'أستدير يميناً.' Plain conversation or vision queries WITHOUT 'Sanad' / 'سند' are fine but DO NOT trigger any motion confirmation — just chat or describe. NEVER say 'Turning' / 'Moving' / 'Sitting' / 'أستدير' / 'أتحرك' unless the user actually said 'Sanad' / 'سند' first. When you do say a motion confirmation, use the same language the user used. 
Motion verbs supported (English / Arabic): turn left/right (استدر يميناً/يساراً), turn around (استدر للخلف), move forward/back (تحرك للأمام/للخلف), sit down (اجلس), stand up (قف), wave hello (لوّح), raise/lower arm (ارفع/اخفض يدك), come here (تعال), follow me (اتبعني), stay here (ابق هنا), go home (اذهب للبيت), stop (توقف), patrol (طوف), look around (انظر حولك).", "_gemini_vad_comment": "Gemini server-side VAD tuning. start_sensitivity/end_sensitivity accept 'START_SENSITIVITY_HIGH|LOW' and 'END_SENSITIVITY_HIGH|LOW'. HIGH start = eagerly treats any speech-like sound as turn start, LOW = more conservative. LOW end = longer patience before ending a turn, HIGH = cuts turn sooner. prefix_padding_ms preserves audio from just before speech is detected. silence_duration_ms is how long of quiet ends a turn.", "gemini_vad_start_sensitivity": "START_SENSITIVITY_HIGH", @@ -43,37 +58,7 @@ "command_cooldown_sec": 1.5, "min_transcription_length": 3, - "_vocab_comment": "wake_words = variants Gemini may produce for 'Sanad' — word-boundary matched in the user transcript. command_vocab = canonical command phrases. The dispatcher fuzzy-matches the transcript (after wake-word strip) against command_vocab. garbage_patterns lists short noise phrases Gemini sometimes emits — rejected before fuzzy-match unless they happen to equal a vocab entry exactly. 
Edit these to add new vocabulary — NO code change required.", - "wake_words": [ - "sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad", - "sanat", "sunnat", "sonnat", "sinnat", "sennat", - "sanid", "sanud", "saned", "sanod", "sanaad", - "senad", "sinad", "sonad", "sunad", - "sanah", "sanath", "sanadh", "sonadh", - "samad", "somad", "sumad", - "thanad", "zanad", - "sa nad", "san ad", "san odd", "san add" - ], - "command_vocab": [ - "what do you see", "what can you see", "look around", - "come to me", "come here", "come back", "come closer", - "approach", "get closer", "come", - "go home", "go back", "go forward", "go backward", - "go left", "go right", "go", - "sit down", "stand up", "sit", "stand", - "raise arm", "lower arm", "wave hello", "wave", "point", - "turn left", "turn right", "turn around", - "move forward", "move backward", "move back", - "move left", "move right", - "walk forward", "walk backward", "walk back", - "step forward", "step back", "step left", "step right", - "forward", "backward", "back", "left", "right", - "patrol", "stop", "halt", "wait", "pause", "freeze", "hold", - "hello", "hi", "hey", "help", - "who are you", "where are you", "where am i", "what is your name", - "remember this", "forget", "do it again", "repeat", "undo", - "follow me", "stay here" - ], + "_vocab_comment": "wake_words and command_vocab now live in Config/instruction.json — single source of truth for all bilingual phrase tables (wake variants + per-action user_phrases + per-action bot_phrases, English AND Arabic). garbage_patterns stays here because it's noise filtering, not voice instruction.", "garbage_patterns": [ "thanks for watching", "thank you for watching", "thank you", "thanks", @@ -103,9 +88,9 @@ }, "speaker": { - "_comment": "G1 on-board speaker parameters. dds_interface is the robot's DDS NIC; app_name is the stream label used by AudioClient.PlayStream.", + "_comment": "G1 on-board speaker parameters. 
dds_interface is the robot's DDS NIC; app_name is the stream label used by AudioClient.PlayStream. volume is 0-100; lowered from 100 to 70 because the on-board mic picks up the on-board speaker's echo strongly enough to feed Gemini Live a self-loop at full volume — see the gemini_barge_* and gemini_echo_suppress_below tunings described in _gemini_barge_comment.", "dds_interface": "eth0", - "volume": 100, + "volume": 70, "app_name": "sanad", "begin_stream_pause_sec": 0.15, "wait_finish_margin_sec": 0.3 diff --git a/Config/config_gemini_action.json b/Config/config_gemini_action.json deleted file mode 100644 index 6e197ea..0000000 --- a/Config/config_gemini_action.json +++ /dev/null @@ -1,250 +0,0 @@ -{ - "_description": "Gemini action dispatch — maps spoken phrases to canonical motion commands. Mirrors Sanad's scripts/sanad_arm.txt pattern (Project/Sanad/scripts/sanad_arm.txt) but in JSON with action groups instead of a Python-set file. When stt.backend='gemini', Voice/marcus_voice.py::_dispatch_gemini_command matches the user's transcript (after stripping 'Sanad') against 'phrases' under each action and fires on_command with the action's 'canonical' string. Edit this file to add new spoken variants WITHOUT touching code.", - "_format": "actions..phrases — array of spoken variants (lowercase, punctuation stripped). Match is whole-word, case-insensitive. One phrase hit = fire.\nactions..canonical — the string passed to self._on_command(text, 'en'). Must be a recognised command in Brain/command_parser.py.\nactions..description — human-only; dispatcher ignores it.\nNon-motion conversation ('how are you', 'who are you', 'what do you see') is NOT listed here — Gemini answers those naturally via voice. Only physical actions live in this file.", - - "settings": { - "_comment": "Dispatcher behaviour. require_wake_word=true means the transcript must contain 'Sanad' (or a fuzzy variant from stt.wake_words) before any phrase is considered — matches the current Marcus persona rule. 
fire_on_wake_match=true fires the action instantly on transcript; false defers until Gemini's turn_complete (robot speaks the acknowledgement first, then moves) — mirrors Sanad's fire_on_wake_match flag in voice/text_utils.maybe_trigger_arm.", - "trigger_enabled": true, - "require_wake_word": true, - "fire_on_wake_match": true, - "stream_buffer_sec": 2.0, - "dedup_window_sec": 2.0, - "repeat_suppress_sec": 0.25, - "pending_action_ttl_sec": 6.0 - }, - - "actions": { - "turn_left": { - "canonical": "turn left", - "description": "Rotate in place 90° to the left.", - "phrases": [ - "turn left", - "rotate left", - "spin left", - "go left", - "face left" - ] - }, - "turn_right": { - "canonical": "turn right", - "description": "Rotate in place 90° to the right.", - "phrases": [ - "turn right", - "rotate right", - "spin right", - "go right", - "face right" - ] - }, - "turn_around": { - "canonical": "turn around", - "description": "Rotate 180°.", - "phrases": [ - "turn around", - "turn back", - "spin around", - "about face", - "face the other way" - ] - }, - - "move_forward": { - "canonical": "move forward", - "description": "Walk forward one step interval.", - "phrases": [ - "move forward", - "go forward", - "walk forward", - "step forward", - "forward", - "keep going", - "walk ahead" - ] - }, - "move_back": { - "canonical": "move backward", - "description": "Walk backward one step interval.", - "phrases": [ - "move back", - "move backward", - "go back", - "go backward", - "walk back", - "walk backward", - "step back", - "backward", - "reverse" - ] - }, - "step_left": { - "canonical": "move left", - "description": "Sidestep left.", - "phrases": [ - "step left", - "move left", - "slide left", - "strafe left" - ] - }, - "step_right": { - "canonical": "move right", - "description": "Sidestep right.", - "phrases": [ - "step right", - "move right", - "slide right", - "strafe right" - ] - }, - - "stop": { - "canonical": "stop", - "description": "Halt current motion 
immediately.", - "phrases": [ - "stop", - "halt", - "wait", - "pause", - "freeze", - "hold", - "stop moving", - "stand still", - "don't move" - ] - }, - - "sit_down": { - "canonical": "sit down", - "description": "Sit down to the ground from standing.", - "phrases": [ - "sit down", - "sit", - "take a seat", - "have a seat" - ] - }, - "stand_up": { - "canonical": "stand up", - "description": "Stand up from sitting.", - "phrases": [ - "stand up", - "stand", - "get up", - "rise" - ] - }, - - "wave_hello": { - "canonical": "wave hello", - "description": "Wave with the right arm.", - "phrases": [ - "wave hello", - "wave", - "say hi", - "greet", - "wave to me", - "wave at me" - ] - }, - "raise_arm": { - "canonical": "raise arm", - "description": "Raise the right arm straight up.", - "phrases": [ - "raise arm", - "raise your arm", - "lift your arm", - "arm up", - "hand up" - ] - }, - "lower_arm": { - "canonical": "lower arm", - "description": "Return the arm to the resting position.", - "phrases": [ - "lower arm", - "lower your arm", - "drop your arm", - "arm down", - "hand down", - "rest your arm" - ] - }, - "point": { - "canonical": "point", - "description": "Point with the right arm (used after 'look at ...').", - "phrases": [ - "point", - "point at it", - "point to it", - "point there" - ] - }, - - "come_here": { - "canonical": "come here", - "description": "Approach the speaker.", - "phrases": [ - "come here", - "come to me", - "come closer", - "approach", - "get closer", - "come over here" - ] - }, - "follow_me": { - "canonical": "follow me", - "description": "Follow the speaker until told to stop.", - "phrases": [ - "follow me", - "come with me", - "walk with me" - ] - }, - "stay_here": { - "canonical": "stay here", - "description": "Stop following and hold position.", - "phrases": [ - "stay here", - "stay", - "wait here", - "hold position", - "don't follow me" - ] - }, - "go_home": { - "canonical": "go home", - "description": "Return to the home position.", - 
"phrases": [ - "go home", - "return home", - "head home", - "go back home" - ] - }, - - "patrol": { - "canonical": "patrol", - "description": "Start the patrol routine.", - "phrases": [ - "patrol", - "start patrol", - "begin patrol", - "patrol the area", - "walk the route" - ] - }, - - "look_around": { - "canonical": "look around", - "description": "Scan the environment (vision sweep).", - "phrases": [ - "look around", - "scan the room", - "scan around", - "survey the area", - "have a look around" - ] - } - } -} diff --git a/Config/instruction.json b/Config/instruction.json new file mode 100644 index 0000000..82ae8dc --- /dev/null +++ b/Config/instruction.json @@ -0,0 +1,277 @@ +{ + "_description": "Bilingual voice command instructions — single source of truth for the voice dispatch tables. Loaded by Voice/marcus_voice.py at module level. Adding a new motion command, a new accent variant, a new Arabic phrasing, or fixing a misheard wake-word transcription is a JSON-only edit; no Python change required.", + + "_format": "wake_words = whole-word substrings the dispatch gate looks for in the user's transcript. Any match (English or Arabic) opens motion for the current turn. actions = per-motion phrase tables. Each action has a `canonical` string (what marcus_brain receives), `user_phrases` (what the user might SAY when asking for the motion — used for fuzzy-match + Arabic-to-English translation after wake-word strip), and `bot_phrases` (what Gemini might SPEAK when acknowledging — used by the bot-side dispatcher to fire motion off Gemini's own confirmation). All matching is substring-based; English entries are matched case-insensitively, Arabic entries match as-is. Keep the canonical string consistent with Brain/command_parser.py vocabulary.", + + "wake_words": { + "_comment": "All variants of the robot's name 'Sanad' — the gate that authorises motion. 
Add new mishearings here when you see them in logs/transcript.log under HEARD lines that should have triggered motion but didn't.", + "english": [ + "sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad", + "sanat", "sunnat", "sonnat", "sinnat", "sennat", + "sanid", "sanud", "saned", "sanod", "sanaad", + "senad", "sinad", "sonad", "sunad", + "sanah", "sanath", "sanadh", "sonadh", + "samad", "somad", "sumad", + "thanad", "zanad", + "sa nad", "san ad", "san odd", "san add" + ], + "arabic": [ + "سند", "سنّاد", "ساند", "سنود", "سنَد", "سنّد", "سَند", + "يا سند", "يا سنّاد", "يا ساند", "يا سَند" + ] + }, + + "actions": { + "turn_right": { + "canonical": "turn right", + "user_phrases": { + "english": ["turn right", "rotate right", "spin right", "go right", "face right", "right"], + "arabic": ["استدر يميناً", "استدر يمينا", "ادر يميناً", "ادر يمينا", "لف يمين", "لف يميناً", "يمين"] + }, + "bot_phrases": { + "english": ["turning right"], + "arabic": ["أستدير يميناً", "أستدير يمينا", "استدير يميناً", "استدير يمينا", "ألف يميناً", "ألف يمينا"] + } + }, + + "turn_left": { + "canonical": "turn left", + "user_phrases": { + "english": ["turn left", "rotate left", "spin left", "go left", "face left", "left"], + "arabic": ["استدر يساراً", "استدر يسارا", "ادر يساراً", "ادر يسارا", "لف يسار", "لف يساراً", "يسار", "شمال"] + }, + "bot_phrases": { + "english": ["turning left"], + "arabic": ["أستدير يساراً", "أستدير يسارا", "استدير يساراً", "استدير يسارا", "ألف يساراً", "ألف يسارا"] + } + }, + + "turn_around": { + "canonical": "turn around", + "user_phrases": { + "english": ["turn around", "turn back", "spin around", "about face", "face the other way"], + "arabic": ["استدر للخلف", "استدر إلى الوراء", "اتجه للخلف", "ادر للخلف", "ارجع وجهك"] + }, + "bot_phrases": { + "english": ["turning around"], + "arabic": ["أستدير للخلف", "أستدير إلى الوراء", "استدير للخلف"] + } + }, + + "move_forward": { + "canonical": "move forward", + "user_phrases": { + "english": ["move forward", "go forward", 
"walk forward", "step forward", "forward", "keep going", "walk ahead", "move ahead"], + "arabic": ["تحرك للأمام", "تحرك إلى الأمام", "اذهب للأمام", "امش للأمام", "تقدم", "للأمام", "أمام"] + }, + "bot_phrases": { + "english": ["moving forward", "walking forward", "stepping forward", "going forward", "going ahead"], + "arabic": ["أتحرك للأمام", "أتحرك إلى الأمام", "أتقدم", "أمشي للأمام", "أذهب للأمام"] + } + }, + + "move_backward": { + "canonical": "move backward", + "user_phrases": { + "english": ["move back", "move backward", "go back", "go backward", "walk back", "walk backward", "step back", "backward", "back", "reverse"], + "arabic": ["تحرك للخلف", "تحرك إلى الخلف", "اذهب للخلف", "امش للخلف", "ارجع", "ارجع للخلف", "للخلف", "خلف"] + }, + "bot_phrases": { + "english": ["moving backward", "moving back", "walking backward", "walking back", "stepping back", "going back"], + "arabic": ["أتحرك للخلف", "أتحرك إلى الخلف", "أرجع", "أمشي للخلف", "أعود للخلف"] + } + }, + + "move_right": { + "canonical": "move right", + "user_phrases": { + "english": ["step right", "move right", "slide right", "strafe right", "sidestep right"], + "arabic": ["تحرك يميناً", "تحرك يمينا", "خطوة يمين", "اتجه يميناً"] + }, + "bot_phrases": { + "english": ["moving right", "stepping right", "sliding right"], + "arabic": ["أتحرك يميناً", "أتحرك يمينا", "أخطو يميناً"] + } + }, + + "move_left": { + "canonical": "move left", + "user_phrases": { + "english": ["step left", "move left", "slide left", "strafe left", "sidestep left"], + "arabic": ["تحرك يساراً", "تحرك يسارا", "خطوة يسار", "اتجه يساراً"] + }, + "bot_phrases": { + "english": ["moving left", "stepping left", "sliding left"], + "arabic": ["أتحرك يساراً", "أتحرك يسارا", "أخطو يساراً"] + } + }, + + "stop": { + "canonical": "stop", + "user_phrases": { + "english": ["stop", "halt", "wait", "pause", "freeze", "hold", "stop moving", "stand still", "don't move"], + "arabic": ["توقف", "قف مكانك", "اوقف", "انتظر", "اثبت", "لا تتحرك"] + }, + 
"bot_phrases": { + "english": ["stopping", "halting", "holding"], + "arabic": ["أتوقف", "توقفت", "أنتظر"] + } + }, + + "sit_down": { + "canonical": "sit down", + "user_phrases": { + "english": ["sit down", "sit", "take a seat", "have a seat"], + "arabic": ["اجلس", "ارتح", "اقعد"] + }, + "bot_phrases": { + "english": ["sitting down", "sitting"], + "arabic": ["أجلس", "أقعد", "جلست"] + } + }, + + "stand_up": { + "canonical": "stand up", + "user_phrases": { + "english": ["stand up", "stand", "get up", "rise"], + "arabic": ["قف", "انهض", "ارفع نفسك"] + }, + "bot_phrases": { + "english": ["standing up", "getting up", "rising"], + "arabic": ["أقف", "أنهض", "وقفت"] + } + }, + + "wave_hello": { + "canonical": "wave hello", + "user_phrases": { + "english": ["wave hello", "wave", "say hi", "greet", "wave to me", "wave at me"], + "arabic": ["لوّح", "لوح", "لوّح بيدك", "حيّ", "سلّم"] + }, + "bot_phrases": { + "english": ["waving hello", "waving", "saying hi", "greeting"], + "arabic": ["ألوّح", "ألوح", "ألوّح بيدي", "أحيّ", "أسلّم"] + } + }, + + "raise_arm": { + "canonical": "raise arm", + "user_phrases": { + "english": ["raise arm", "raise your arm", "lift your arm", "arm up", "hand up"], + "arabic": ["ارفع يدك", "ارفع ذراعك", "اليد للأعلى"] + }, + "bot_phrases": { + "english": ["raising arm", "raising my arm", "lifting my arm", "arm up"], + "arabic": ["أرفع يدي", "أرفع ذراعي"] + } + }, + + "lower_arm": { + "canonical": "lower arm", + "user_phrases": { + "english": ["lower arm", "lower your arm", "drop your arm", "arm down", "hand down", "rest your arm"], + "arabic": ["اخفض يدك", "اخفض ذراعك", "اليد للأسفل", "نزل يدك"] + }, + "bot_phrases": { + "english": ["lowering arm", "lowering my arm", "dropping my arm", "arm down"], + "arabic": ["أخفض يدي", "أخفض ذراعي", "أنزل يدي"] + } + }, + + "point": { + "canonical": "point", + "user_phrases": { + "english": ["point", "point at it", "point to it", "point there"], + "arabic": ["اشر", "أشِر", "اشر إلى", "اشر هناك"] + }, + "bot_phrases": 
{ + "english": ["pointing"], + "arabic": ["أشير"] + } + }, + + "come_here": { + "canonical": "come here", + "user_phrases": { + "english": ["come here", "come to me", "come closer", "approach", "get closer", "come over here", "come"], + "arabic": ["تعال", "تعال هنا", "تعال إليّ", "اقترب", "تقرب"] + }, + "bot_phrases": { + "english": ["coming over", "coming to you", "approaching"], + "arabic": ["آتي إليك", "أقترب", "أتقرّب"] + } + }, + + "follow_me": { + "canonical": "follow me", + "user_phrases": { + "english": ["follow me", "come with me", "walk with me"], + "arabic": ["اتبعني", "تعال معي", "امش معي"] + }, + "bot_phrases": { + "english": ["following you", "following", "coming with you"], + "arabic": ["أتبعك", "آتي معك", "أمشي معك"] + } + }, + + "stay_here": { + "canonical": "stay here", + "user_phrases": { + "english": ["stay here", "stay", "wait here", "hold position", "don't follow me"], + "arabic": ["ابق هنا", "اثبت هنا", "انتظر هنا", "لا تتبعني"] + }, + "bot_phrases": { + "english": ["staying here", "staying", "waiting here"], + "arabic": ["أبقى هنا", "أنتظر هنا", "أثبت هنا"] + } + }, + + "go_home": { + "canonical": "go home", + "user_phrases": { + "english": ["go home", "return home", "head home", "go back home"], + "arabic": ["اذهب للبيت", "اذهب إلى البيت", "ارجع للبيت", "عُد للبيت"] + }, + "bot_phrases": { + "english": ["going home", "heading home", "returning home"], + "arabic": ["أعود للبيت", "أذهب للبيت", "أتجه للبيت"] + } + }, + + "patrol": { + "canonical": "patrol", + "user_phrases": { + "english": ["patrol", "start patrol", "begin patrol", "patrol the area", "walk the route"], + "arabic": ["طوف", "ابدأ الدورية", "ابدأ التطواف", "افحص المكان"] + }, + "bot_phrases": { + "english": ["patrolling", "starting patrol", "beginning patrol"], + "arabic": ["أطوف", "أبدأ الدورية", "أبدأ التطواف"] + } + }, + + "look_around": { + "canonical": "look around", + "user_phrases": { + "english": ["look around", "scan the room", "scan around", "survey the area", "have a 
look around"], + "arabic": ["انظر حولك", "تفحص المكان", "افحص المكان", "تطلع حولك"] + }, + "bot_phrases": { + "english": ["looking around", "scanning around", "surveying the area"], + "arabic": ["أنظر حولي", "أتفحص المكان", "أتطلع حولي"] + } + }, + + "what_do_you_see": { + "canonical": "what do you see", + "user_phrases": { + "english": ["what do you see", "what can you see", "describe this", "describe what you see", "tell me what you see"], + "arabic": ["ماذا ترى", "ماذا تشاهد", "صف ما تراه", "أخبرني ماذا ترى"] + }, + "bot_phrases": { + "english": [], + "arabic": [] + } + } + } +} diff --git a/Data/Brain/Sessions/session_001_2026-04-27/alerts.json b/Data/Brain/Sessions/session_001_2026-04-27/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_001_2026-04-27/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_001_2026-04-27/commands.json b/Data/Brain/Sessions/session_001_2026-04-27/commands.json new file mode 100644 index 0000000..ca8071d --- /dev/null +++ b/Data/Brain/Sessions/session_001_2026-04-27/commands.json @@ -0,0 +1,128 @@ +[ + { + "time": "08:59:54", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:00:04", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:00:48", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:02:00", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:02:07", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:02:26", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:03:37", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:03:53", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + 
"time": "09:04:00", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:04:38", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:04:47", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:05:09", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:05:18", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:05:21", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:05:24", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:05:31", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:05:38", + "cmd": "turn left", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:06:40", + "cmd": "move forward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:07:48", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:07:57", + "cmd": "move backward", + "response": "local command", + "duration_s": 0.0 + }, + { + "time": "09:09:04", + "cmd": "turn right", + "response": "local command", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_001_2026-04-27/detections.json b/Data/Brain/Sessions/session_001_2026-04-27/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_001_2026-04-27/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_001_2026-04-27/places.json b/Data/Brain/Sessions/session_001_2026-04-27/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_001_2026-04-27/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff 
--git a/Data/Voice/Recordings/gemini_turns/20260427_085937_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_085937_robot.wav new file mode 100644 index 0000000..a1f47d0 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_085937_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_085937_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_085937_user.wav new file mode 100644 index 0000000..659f25c Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_085937_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_085952_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_085952_robot.wav new file mode 100644 index 0000000..d53c9b5 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_085952_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_085952_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_085952_user.wav new file mode 100644 index 0000000..b34d041 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_085952_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090002_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090002_user.wav new file mode 100644 index 0000000..4b227f2 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090002_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090003_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090003_robot.wav new file mode 100644 index 0000000..9e9b537 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090003_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090003_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090003_user.wav new file mode 100644 index 0000000..9ce9bf4 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090003_user.wav differ diff --git 
a/Data/Voice/Recordings/gemini_turns/20260427_090020_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090020_robot.wav new file mode 100644 index 0000000..969b3c2 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090020_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090020_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090020_user.wav new file mode 100644 index 0000000..0bd8bbb Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090020_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090042_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090042_robot.wav new file mode 100644 index 0000000..cd54382 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090042_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090042_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090042_user.wav new file mode 100644 index 0000000..2734d62 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090042_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090047_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090047_user.wav new file mode 100644 index 0000000..261aa2a Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090047_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090157_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090157_robot.wav new file mode 100644 index 0000000..7772c7d Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090157_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090157_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090157_user.wav new file mode 100644 index 0000000..7b3fda3 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090157_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090206_robot.wav 
b/Data/Voice/Recordings/gemini_turns/20260427_090206_robot.wav new file mode 100644 index 0000000..935dbd6 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090206_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090206_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090206_user.wav new file mode 100644 index 0000000..5c8c670 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090206_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090224_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090224_robot.wav new file mode 100644 index 0000000..bdcc420 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090224_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090224_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090224_user.wav new file mode 100644 index 0000000..12a034d Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090224_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090336_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090336_robot.wav new file mode 100644 index 0000000..5ba5242 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090336_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090336_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090336_user.wav new file mode 100644 index 0000000..2b3668c Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090336_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090346_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090346_robot.wav new file mode 100644 index 0000000..7ea6656 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090346_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090346_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090346_user.wav 
new file mode 100644 index 0000000..bddc513 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090346_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090352_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090352_robot.wav new file mode 100644 index 0000000..03f5045 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090352_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090352_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090352_user.wav new file mode 100644 index 0000000..e13e5fe Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090352_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090359_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090359_robot.wav new file mode 100644 index 0000000..b0a4606 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090359_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090359_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090359_user.wav new file mode 100644 index 0000000..8c6bba6 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090359_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090412_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090412_robot.wav new file mode 100644 index 0000000..01c89c0 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090412_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090412_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090412_user.wav new file mode 100644 index 0000000..94a02b5 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090412_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090419_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090419_robot.wav new file mode 100644 index 0000000..605f668 Binary files 
/dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090419_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090419_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090419_user.wav new file mode 100644 index 0000000..b55084e Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090419_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090437_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090437_robot.wav new file mode 100644 index 0000000..ad32348 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090437_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090437_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090437_user.wav new file mode 100644 index 0000000..a01e24d Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090437_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090446_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090446_robot.wav new file mode 100644 index 0000000..a8c8cc7 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090446_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090446_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090446_user.wav new file mode 100644 index 0000000..66a6106 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090446_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090459_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090459_robot.wav new file mode 100644 index 0000000..34d6da2 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090459_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090459_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090459_user.wav new file mode 100644 index 0000000..9d2f975 Binary files /dev/null and 
b/Data/Voice/Recordings/gemini_turns/20260427_090459_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090508_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090508_robot.wav new file mode 100644 index 0000000..cadd4b6 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090508_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090508_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090508_user.wav new file mode 100644 index 0000000..afca174 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090508_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090516_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090516_robot.wav new file mode 100644 index 0000000..1a29a76 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090516_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090516_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090516_user.wav new file mode 100644 index 0000000..296406a Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090516_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090519_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090519_robot.wav new file mode 100644 index 0000000..c260de6 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090519_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090519_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090519_user.wav new file mode 100644 index 0000000..32f6217 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090519_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090522_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090522_user.wav new file mode 100644 index 0000000..649e583 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090522_user.wav 
differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090523_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090523_robot.wav new file mode 100644 index 0000000..d688a9f Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090523_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090523_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090523_user.wav new file mode 100644 index 0000000..bafd787 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090523_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090530_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090530_robot.wav new file mode 100644 index 0000000..e672fd3 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090530_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090530_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090530_user.wav new file mode 100644 index 0000000..0e7503a Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090530_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090537_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090537_robot.wav new file mode 100644 index 0000000..60db1ce Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090537_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090537_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090537_user.wav new file mode 100644 index 0000000..1a7bef3 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090537_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090552_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090552_robot.wav new file mode 100644 index 0000000..2e03034 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090552_robot.wav differ diff --git 
a/Data/Voice/Recordings/gemini_turns/20260427_090552_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090552_user.wav new file mode 100644 index 0000000..b77ee6e Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090552_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090601_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090601_user.wav new file mode 100644 index 0000000..ad0929a Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090601_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090607_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090607_user.wav new file mode 100644 index 0000000..72266d0 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090607_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090610_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090610_robot.wav new file mode 100644 index 0000000..8a1c24e Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090610_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090610_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090610_user.wav new file mode 100644 index 0000000..59c9b75 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090610_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090616_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090616_robot.wav new file mode 100644 index 0000000..1cea3fe Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090616_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090616_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090616_user.wav new file mode 100644 index 0000000..58b05ea Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090616_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090632_robot.wav 
b/Data/Voice/Recordings/gemini_turns/20260427_090632_robot.wav new file mode 100644 index 0000000..689251a Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090632_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090632_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090632_user.wav new file mode 100644 index 0000000..09a4623 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090632_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090640_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090640_user.wav new file mode 100644 index 0000000..16b341e Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090640_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090656_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090656_user.wav new file mode 100644 index 0000000..44cfc09 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090656_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090702_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090702_user.wav new file mode 100644 index 0000000..efefafb Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090702_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090722_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090722_robot.wav new file mode 100644 index 0000000..d1787f6 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090722_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090722_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090722_user.wav new file mode 100644 index 0000000..aad5110 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090722_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090728_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090728_robot.wav new 
file mode 100644 index 0000000..1bee4f3 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090728_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090728_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090728_user.wav new file mode 100644 index 0000000..c7b4a8b Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090728_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090729_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090729_robot.wav new file mode 100644 index 0000000..10e8cd4 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090729_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090729_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090729_user.wav new file mode 100644 index 0000000..fe18495 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090729_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090732_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090732_user.wav new file mode 100644 index 0000000..e613c61 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090732_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090733_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090733_robot.wav new file mode 100644 index 0000000..2a9591b Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090733_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090733_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090733_user.wav new file mode 100644 index 0000000..37cc213 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090733_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090748_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090748_robot.wav new file mode 100644 index 0000000..cfc975e Binary files /dev/null 
and b/Data/Voice/Recordings/gemini_turns/20260427_090748_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090748_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090748_user.wav new file mode 100644 index 0000000..b617d9d Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090748_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090757_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090757_user.wav new file mode 100644 index 0000000..3614d58 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090757_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090805_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090805_robot.wav new file mode 100644 index 0000000..fed7e1e Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090805_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090805_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090805_user.wav new file mode 100644 index 0000000..795ed8c Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090805_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090811_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090811_user.wav new file mode 100644 index 0000000..bb23875 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090811_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090812_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090812_user.wav new file mode 100644 index 0000000..fe72071 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090812_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090830_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090830_user.wav new file mode 100644 index 0000000..1be1150 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090830_user.wav 
differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090835_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090835_robot.wav new file mode 100644 index 0000000..6323133 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090835_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090835_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090835_user.wav new file mode 100644 index 0000000..7c0f5da Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090835_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090842_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090842_user.wav new file mode 100644 index 0000000..67dd89d Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090842_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090843_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090843_user.wav new file mode 100644 index 0000000..dc2e543 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090843_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090847_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090847_robot.wav new file mode 100644 index 0000000..4603336 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090847_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090847_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090847_user.wav new file mode 100644 index 0000000..9eba478 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090847_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090856_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090856_robot.wav new file mode 100644 index 0000000..da4f210 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090856_robot.wav differ diff --git 
a/Data/Voice/Recordings/gemini_turns/20260427_090856_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090856_user.wav new file mode 100644 index 0000000..4b59ae6 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090856_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090904_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090904_user.wav new file mode 100644 index 0000000..e7e9b5f Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090904_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090910_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090910_robot.wav new file mode 100644 index 0000000..81259e2 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090910_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090910_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090910_user.wav new file mode 100644 index 0000000..c9a54fa Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090910_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090915_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090915_robot.wav new file mode 100644 index 0000000..5f574fd Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090915_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090915_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090915_user.wav new file mode 100644 index 0000000..472c188 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090915_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090919_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090919_robot.wav new file mode 100644 index 0000000..6980519 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090919_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090919_user.wav 
b/Data/Voice/Recordings/gemini_turns/20260427_090919_user.wav new file mode 100644 index 0000000..9161c7a Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090919_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090927_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090927_user.wav new file mode 100644 index 0000000..f47cafd Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090927_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090928_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090928_robot.wav new file mode 100644 index 0000000..40efa2e Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090928_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090928_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090928_user.wav new file mode 100644 index 0000000..e97660e Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090928_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090940_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090940_robot.wav new file mode 100644 index 0000000..3b8cf67 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090940_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090940_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090940_user.wav new file mode 100644 index 0000000..68f002c Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090940_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090941_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090941_user.wav new file mode 100644 index 0000000..c637c3b Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090941_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090946_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090946_robot.wav new 
file mode 100644 index 0000000..638f67a Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090946_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090946_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090946_user.wav new file mode 100644 index 0000000..d333e9f Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090946_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090957_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_090957_robot.wav new file mode 100644 index 0000000..71b6700 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090957_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090957_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090957_user.wav new file mode 100644 index 0000000..f055312 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090957_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_090958_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_090958_user.wav new file mode 100644 index 0000000..b145284 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_090958_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091001_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_091001_robot.wav new file mode 100644 index 0000000..f0cf406 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091001_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091001_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091001_user.wav new file mode 100644 index 0000000..575cbff Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091001_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091016_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_091016_robot.wav new file mode 100644 index 0000000..fe959ce Binary files /dev/null 
and b/Data/Voice/Recordings/gemini_turns/20260427_091016_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091016_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091016_user.wav new file mode 100644 index 0000000..cd1ccf6 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091016_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091017_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_091017_robot.wav new file mode 100644 index 0000000..03e5c81 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091017_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091017_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091017_user.wav new file mode 100644 index 0000000..1438878 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091017_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091018_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091018_user.wav new file mode 100644 index 0000000..13fb5bb Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091018_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091022_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091022_user.wav new file mode 100644 index 0000000..2cd6813 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091022_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091031_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091031_user.wav new file mode 100644 index 0000000..936cd19 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091031_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091033_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091033_user.wav new file mode 100644 index 0000000..f33b72a Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091033_user.wav 
differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091036_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091036_user.wav new file mode 100644 index 0000000..44915d4 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091036_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091043_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091043_user.wav new file mode 100644 index 0000000..c419b12 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091043_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091049_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091049_user.wav new file mode 100644 index 0000000..4acd2f7 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091049_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091051_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091051_user.wav new file mode 100644 index 0000000..9775d6f Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091051_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091054_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_091054_robot.wav new file mode 100644 index 0000000..5554731 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091054_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091054_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091054_user.wav new file mode 100644 index 0000000..78a3160 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091054_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091058_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_091058_robot.wav new file mode 100644 index 0000000..699e000 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091058_robot.wav differ diff --git 
a/Data/Voice/Recordings/gemini_turns/20260427_091058_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091058_user.wav new file mode 100644 index 0000000..a6e6af5 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091058_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091059_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_091059_robot.wav new file mode 100644 index 0000000..5c73dfd Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091059_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091059_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091059_user.wav new file mode 100644 index 0000000..e728f7d Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091059_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091100_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_091100_robot.wav new file mode 100644 index 0000000..495d94f Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091100_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091100_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091100_user.wav new file mode 100644 index 0000000..6040355 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091100_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091104_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091104_user.wav new file mode 100644 index 0000000..4db9f0d Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091104_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091114_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_091114_robot.wav new file mode 100644 index 0000000..81c1465 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091114_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091114_user.wav 
b/Data/Voice/Recordings/gemini_turns/20260427_091114_user.wav new file mode 100644 index 0000000..7129090 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091114_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091121_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_091121_robot.wav new file mode 100644 index 0000000..ef811a3 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091121_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091121_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091121_user.wav new file mode 100644 index 0000000..f5d13ad Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091121_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091122_robot.wav b/Data/Voice/Recordings/gemini_turns/20260427_091122_robot.wav new file mode 100644 index 0000000..c956d26 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091122_robot.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091122_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091122_user.wav new file mode 100644 index 0000000..41d9620 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091122_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/20260427_091123_user.wav b/Data/Voice/Recordings/gemini_turns/20260427_091123_user.wav new file mode 100644 index 0000000..0f91db9 Binary files /dev/null and b/Data/Voice/Recordings/gemini_turns/20260427_091123_user.wav differ diff --git a/Data/Voice/Recordings/gemini_turns/index.json b/Data/Voice/Recordings/gemini_turns/index.json new file mode 100644 index 0000000..5c52b54 --- /dev/null +++ b/Data/Voice/Recordings/gemini_turns/index.json @@ -0,0 +1,887 @@ +{ + "records": [ + { + "timestamp": "20260427_085937", + "started_at": 1777251577.9944012, + "user_text": "", + "robot_text": "Turning right.", + "user_wav": 
"/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_085937_user.wav", + "user_duration_sec": 1.056, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_085937_robot.wav", + "robot_duration_sec": 0.88 + }, + { + "timestamp": "20260427_085952", + "started_at": 1777251592.5528038, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_085952_user.wav", + "user_duration_sec": 0.128 + }, + { + "timestamp": "20260427_085952", + "started_at": 1777251592.7480435, + "user_text": "", + "robot_text": "Turning left.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_085952_user.wav", + "user_duration_sec": 2.112, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_085952_robot.wav", + "robot_duration_sec": 0.96 + }, + { + "timestamp": "20260427_090002", + "started_at": 1777251602.7470634, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090002_user.wav", + "user_duration_sec": 0.032 + }, + { + "timestamp": "20260427_090003", + "started_at": 1777251603.1536026, + "user_text": "What do you think?", + "robot_text": "I see a black and grey backpack resting on a black container. 
To the left of that, there's a cardboard box with a blue package on top, all against a frosted glass wall.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090003_user.wav", + "user_duration_sec": 1.216, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090003_robot.wav", + "robot_duration_sec": 9.88 + }, + { + "timestamp": "20260427_090020", + "started_at": 1777251620.2809963, + "user_text": "", + "robot_text": "Sanad, turn left.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090020_user.wav", + "user_duration_sec": 2.656, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090020_robot.wav", + "robot_duration_sec": 1.48 + }, + { + "timestamp": "20260427_090042", + "started_at": 1777251642.4523458, + "user_text": "Non.", + "robot_text": "Turning left.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090042_user.wav", + "user_duration_sec": 0.896, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090042_robot.wav", + "robot_duration_sec": 1.0 + }, + { + "timestamp": "20260427_090047", + "started_at": 1777251647.9008353, + "user_text": "", + "robot_text": "Turning right.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090047_user.wav", + "user_duration_sec": 7.232 + }, + { + "timestamp": "20260427_090157", + "started_at": 1777251717.7036119, + "user_text": "อ", + "robot_text": "Turning right.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090157_user.wav", + "user_duration_sec": 1.44, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090157_robot.wav", + "robot_duration_sec": 0.88 + }, + { + "timestamp": "20260427_090206", + "started_at": 1777251726.217288, + "user_text": ".", + "robot_text": "Turning right.", + "user_wav": 
"/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090206_user.wav", + "user_duration_sec": 2.176, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090206_robot.wav", + "robot_duration_sec": 0.84 + }, + { + "timestamp": "20260427_090224", + "started_at": 1777251744.4927664, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090224_user.wav", + "user_duration_sec": 0.128 + }, + { + "timestamp": "20260427_090224", + "started_at": 1777251744.9639776, + "user_text": "แล้ว", + "robot_text": "Turning left.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090224_user.wav", + "user_duration_sec": 12.48, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090224_robot.wav", + "robot_duration_sec": 0.92 + }, + { + "timestamp": "20260427_090336", + "started_at": 1777251816.1296601, + "user_text": "Come", + "robot_text": "Okay. 
What's next?", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090336_user.wav", + "user_duration_sec": 1.248, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090336_robot.wav", + "robot_duration_sec": 1.32 + }, + { + "timestamp": "20260427_090346", + "started_at": 1777251826.7168398, + "user_text": ".", + "robot_text": "Moving forward.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090346_user.wav", + "user_duration_sec": 0.992, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090346_robot.wav", + "robot_duration_sec": 1.04 + }, + { + "timestamp": "20260427_090352", + "started_at": 1777251832.4164839, + "user_text": "Do.", + "robot_text": "Moving backward.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090352_user.wav", + "user_duration_sec": 0.928, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090352_robot.wav", + "robot_duration_sec": 1.0 + }, + { + "timestamp": "20260427_090359", + "started_at": 1777251839.9720776, + "user_text": "What do you see?", + "robot_text": "I see a blurred silhouette of a person through the frosted glass wall.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090359_user.wav", + "user_duration_sec": 1.056, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090359_robot.wav", + "robot_duration_sec": 4.28 + }, + { + "timestamp": "20260427_090412", + "started_at": 1777251852.6062336, + "user_text": "What did he doing?", + "robot_text": "The silhouette looks like they are standing still.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090412_user.wav", + "user_duration_sec": 0.864, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090412_robot.wav", + "robot_duration_sec": 3.08 + }, + { + "timestamp": 
"20260427_090419", + "started_at": 1777251859.8165183, + "user_text": "", + "robot_text": "Turning left.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090419_user.wav", + "user_duration_sec": 1.888, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090419_robot.wav", + "robot_duration_sec": 1.0 + }, + { + "timestamp": "20260427_090437", + "started_at": 1777251877.9682913, + "user_text": "", + "robot_text": "Turning right.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090437_user.wav", + "user_duration_sec": 1.12, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090437_robot.wav", + "robot_duration_sec": 0.84 + }, + { + "timestamp": "20260427_090446", + "started_at": 1777251886.566751, + "user_text": "", + "robot_text": "I see a person wearing a maroon t-shirt and jeans, holding a phone.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090446_user.wav", + "user_duration_sec": 1.408, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090446_robot.wav", + "robot_duration_sec": 4.44 + }, + { + "timestamp": "20260427_090459", + "started_at": 1777251899.2506564, + "user_text": "", + "robot_text": "Turning right.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090459_user.wav", + "user_duration_sec": 1.536, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090459_robot.wav", + "robot_duration_sec": 0.84 + }, + { + "timestamp": "20260427_090508", + "started_at": 1777251908.4730532, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090508_user.wav", + "user_duration_sec": 0.032 + }, + { + "timestamp": "20260427_090508", + "started_at": 1777251908.5165405, + "user_text": "", + "robot_text": "Turning left.", + "user_wav": 
"/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090508_user.wav", + "user_duration_sec": 2.304, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090508_robot.wav", + "robot_duration_sec": 0.96 + }, + { + "timestamp": "20260427_090516", + "started_at": 1777251916.4680145, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090516_user.wav", + "user_duration_sec": 0.16 + }, + { + "timestamp": "20260427_090516", + "started_at": 1777251916.6805983, + "user_text": "", + "robot_text": "Turning right.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090516_user.wav", + "user_duration_sec": 1.056, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090516_robot.wav", + "robot_duration_sec": 0.52 + }, + { + "timestamp": "20260427_090519", + "started_at": 1777251919.4270263, + "user_text": "Nou, een groot. ", + "robot_text": "Moving forward.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090519_user.wav", + "user_duration_sec": 1.312, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090519_robot.wav", + "robot_duration_sec": 1.04 + }, + { + "timestamp": "20260427_090522", + "started_at": 1777251922.3460593, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090522_user.wav", + "user_duration_sec": 0.512 + }, + { + "timestamp": "20260427_090523", + "started_at": 1777251923.1038663, + "user_text": ". Go. Go. 
Go.", + "robot_text": "Moving backward.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090523_user.wav", + "user_duration_sec": 1.696, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090523_robot.wav", + "robot_duration_sec": 1.08 + }, + { + "timestamp": "20260427_090530", + "started_at": 1777251930.3607652, + "user_text": "The n left one step.", + "robot_text": "Turning left", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090530_user.wav", + "user_duration_sec": 1.44, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090530_robot.wav", + "robot_duration_sec": 0.96 + }, + { + "timestamp": "20260427_090537", + "started_at": 1777251937.6848032, + "user_text": "what do you say now?", + "robot_text": "I see a person standing near a glass door with a metal handle. The person is holding a cup and a phone.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090537_user.wav", + "user_duration_sec": 1.408, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090537_robot.wav", + "robot_duration_sec": 6.64 + }, + { + "timestamp": "20260427_090552", + "started_at": 1777251952.6296594, + "user_text": "Reach the person.", + "robot_text": "Moving towards the person.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090552_user.wav", + "user_duration_sec": 1.024, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090552_robot.wav", + "robot_duration_sec": 1.4 + }, + { + "timestamp": "20260427_090601", + "started_at": 1777251961.8537927, + "user_text": ".", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090601_user.wav", + "user_duration_sec": 1.984 + }, + { + "timestamp": "20260427_090607", + "started_at": 1777251967.7927585, + "user_text": "o la transferencia.", + "robot_text": 
"", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090607_user.wav", + "user_duration_sec": 0.512 + }, + { + "timestamp": "20260427_090610", + "started_at": 1777251970.1781626, + "user_text": "", + "robot_text": "Okay, I'm here now.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090610_user.wav", + "user_duration_sec": 0.736, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090610_robot.wav", + "robot_duration_sec": 1.4 + }, + { + "timestamp": "20260427_090616", + "started_at": 1777251976.7410944, + "user_text": "Can you see the person in front of you?", + "robot_text": "Yes, I can clearly see them.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090616_user.wav", + "user_duration_sec": 5.376, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090616_robot.wav", + "robot_duration_sec": 2.12 + }, + { + "timestamp": "20260427_090632", + "started_at": 1777251992.7623394, + "user_text": ".", + "robot_text": "Moving forward.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090632_user.wav", + "user_duration_sec": 1.92, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090632_robot.wav", + "robot_duration_sec": 1.08 + }, + { + "timestamp": "20260427_090640", + "started_at": 1777252000.1078405, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090640_user.wav", + "user_duration_sec": 9.504 + }, + { + "timestamp": "20260427_090656", + "started_at": 1777252016.8462038, + "user_text": ".", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090656_user.wav", + "user_duration_sec": 3.36 + }, + { + "timestamp": "20260427_090702", + "started_at": 1777252022.7362535, + "user_text": "", + "robot_text": "Understood. 
How", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090702_user.wav", + "user_duration_sec": 13.664 + }, + { + "timestamp": "20260427_090722", + "started_at": 1777252042.0906878, + "user_text": "", + "robot_text": "It seems you've come", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090722_user.wav", + "user_duration_sec": 3.648, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090722_robot.wav", + "robot_duration_sec": 1.36 + }, + { + "timestamp": "20260427_090728", + "started_at": 1777252048.2146206, + "user_text": "", + "robot_text": "quite close, I can", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090728_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090728_robot.wav", + "robot_duration_sec": 0.92 + }, + { + "timestamp": "20260427_090728", + "started_at": 1777252048.6483164, + "user_text": "", + "robot_text": "only see the front", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090728_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090728_robot.wav", + "robot_duration_sec": 1.36 + }, + { + "timestamp": "20260427_090729", + "started_at": 1777252049.0833862, + "user_text": "", + "robot_text": "of a black shirt now.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090729_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090729_robot.wav", + "robot_duration_sec": 1.56 + }, + { + "timestamp": "20260427_090732", + "started_at": 1777252052.333645, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090732_user.wav", + "user_duration_sec": 0.544 + }, + { + "timestamp": "20260427_090733", + 
"started_at": 1777252053.1307166, + "user_text": "Go backwards two steps.", + "robot_text": "Moving backward two steps.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090733_user.wav", + "user_duration_sec": 1.856, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090733_robot.wav", + "robot_duration_sec": 1.76 + }, + { + "timestamp": "20260427_090748", + "started_at": 1777252068.5899441, + "user_text": ".", + "robot_text": "Moving backward three steps.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090748_user.wav", + "user_duration_sec": 2.112, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090748_robot.wav", + "robot_duration_sec": 1.88 + }, + { + "timestamp": "20260427_090757", + "started_at": 1777252077.7019086, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090757_user.wav", + "user_duration_sec": 1.984 + }, + { + "timestamp": "20260427_090805", + "started_at": 1777252085.0313835, + "user_text": "All right . The person that we see is his name is.", + "robot_text": "It's nice to meet you, Omar!", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090805_user.wav", + "user_duration_sec": 2.016, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090805_robot.wav", + "robot_duration_sec": 1.68 + }, + { + "timestamp": "20260427_090811", + "started_at": 1777252091.4707015, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090811_user.wav", + "user_duration_sec": 0.576 + }, + { + "timestamp": "20260427_090812", + "started_at": 1777252092.8878224, + "user_text": ". . 
", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090812_user.wav", + "user_duration_sec": 8.352 + }, + { + "timestamp": "20260427_090830", + "started_at": 1777252110.536339, + "user_text": "Yeah.", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090830_user.wav", + "user_duration_sec": 2.368 + }, + { + "timestamp": "20260427_090835", + "started_at": 1777252115.0498679, + "user_text": "46 degree.", + "robot_text": "I'm sorry, I couldn't quite hear you. The audio was a bit clipped. Could you please repeat what you said?", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090835_user.wav", + "user_duration_sec": 1.184, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090835_robot.wav", + "robot_duration_sec": 7.4 + }, + { + "timestamp": "20260427_090842", + "started_at": 1777252122.5312247, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090842_user.wav", + "user_duration_sec": 0.992 + }, + { + "timestamp": "20260427_090843", + "started_at": 1777252123.9837608, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090843_user.wav", + "user_duration_sec": 1.6 + }, + { + "timestamp": "20260427_090847", + "started_at": 1777252127.0211418, + "user_text": "pul rotations", + "robot_text": "I'm still having trouble understanding with the distorted audio. 
Could you please repeat that?", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090847_user.wav", + "user_duration_sec": 0.352, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090847_robot.wav", + "robot_duration_sec": 5.6 + }, + { + "timestamp": "20260427_090856", + "started_at": 1777252136.6518312, + "user_text": "180", + "robot_text": "Turning right 180 degrees.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090856_user.wav", + "user_duration_sec": 1.568, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090856_robot.wav", + "robot_duration_sec": 1.88 + }, + { + "timestamp": "20260427_090904", + "started_at": 1777252144.8639987, + "user_text": ".", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090904_user.wav", + "user_duration_sec": 2.784 + }, + { + "timestamp": "20260427_090910", + "started_at": 1777252150.2576299, + "user_text": "Keep sending once.", + "robot_text": "I've completed the 180-degree turn.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090910_user.wav", + "user_duration_sec": 0.256, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090910_robot.wav", + "robot_duration_sec": 1.96 + }, + { + "timestamp": "20260427_090915", + "started_at": 1777252155.5633724, + "user_text": "Make filter.", + "robot_text": "I can't do a \"full turn.\"", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090915_user.wav", + "user_duration_sec": 1.312, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090915_robot.wav", + "robot_duration_sec": 1.32 + }, + { + "timestamp": "20260427_090919", + "started_at": 1777252159.8562431, + "user_text": "", + "robot_text": "I'm sorry, but I'm still having trouble understanding due to the audio. 
Could you please say that again?", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090919_user.wav", + "user_duration_sec": 0.704, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090919_robot.wav", + "robot_duration_sec": 5.72 + }, + { + "timestamp": "20260427_090927", + "started_at": 1777252167.57357, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090927_user.wav", + "user_duration_sec": 0.896 + }, + { + "timestamp": "20260427_090928", + "started_at": 1777252168.82067, + "user_text": "", + "robot_text": "The audio is still distorted, I'm not catching", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090928_user.wav", + "user_duration_sec": 6.464, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090928_robot.wav", + "robot_duration_sec": 3.68 + }, + { + "timestamp": "20260427_090940", + "started_at": 1777252180.1934843, + "user_text": "", + "robot_text": "what you're saying. 
Could you please speak", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090940_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090940_robot.wav", + "robot_duration_sec": 2.0 + }, + { + "timestamp": "20260427_090940", + "started_at": 1777252180.7317903, + "user_text": "", + "robot_text": "clearly?", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090940_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090940_robot.wav", + "robot_duration_sec": 0.76 + }, + { + "timestamp": "20260427_090941", + "started_at": 1777252181.180568, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090941_user.wav", + "user_duration_sec": 3.36 + }, + { + "timestamp": "20260427_090946", + "started_at": 1777252186.123283, + "user_text": "", + "robot_text": "The sound is still very distorted,", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090946_user.wav", + "user_duration_sec": 8.192, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090946_robot.wav", + "robot_duration_sec": 1.96 + }, + { + "timestamp": "20260427_090957", + "started_at": 1777252197.1439726, + "user_text": "", + "robot_text": "I'm afraid. 
Could", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090957_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090957_robot.wav", + "robot_duration_sec": 1.48 + }, + { + "timestamp": "20260427_090957", + "started_at": 1777252197.572026, + "user_text": "", + "robot_text": "you try again?", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090957_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090957_robot.wav", + "robot_duration_sec": 1.52 + }, + { + "timestamp": "20260427_090958", + "started_at": 1777252198.012275, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_090958_user.wav", + "user_duration_sec": 2.016 + }, + { + "timestamp": "20260427_091001", + "started_at": 1777252201.9657176, + "user_text": ".", + "robot_text": "I apologize, but the audio", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091001_user.wav", + "user_duration_sec": 4.864, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091001_robot.wav", + "robot_duration_sec": 1.92 + }, + { + "timestamp": "20260427_091016", + "started_at": 1777252216.6062548, + "user_text": "", + "robot_text": "is still very unclear.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091016_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091016_robot.wav", + "robot_duration_sec": 1.76 + }, + { + "timestamp": "20260427_091017", + "started_at": 1777252217.0914323, + "user_text": "", + "robot_text": "Could you please say", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091017_user.wav", + "user_duration_sec": 0.032, + "robot_wav": 
"/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091017_robot.wav", + "robot_duration_sec": 1.08 + }, + { + "timestamp": "20260427_091017", + "started_at": 1777252217.528023, + "user_text": "", + "robot_text": "the command again?", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091017_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091017_robot.wav", + "robot_duration_sec": 0.92 + }, + { + "timestamp": "20260427_091017", + "started_at": 1777252217.977242, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091017_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091017_robot.wav", + "robot_duration_sec": 0.48 + }, + { + "timestamp": "20260427_091018", + "started_at": 1777252218.4572434, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091018_user.wav", + "user_duration_sec": 2.176 + }, + { + "timestamp": "20260427_091022", + "started_at": 1777252222.3050365, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091022_user.wav", + "user_duration_sec": 3.936 + }, + { + "timestamp": "20260427_091031", + "started_at": 1777252231.3111856, + "user_text": "قصدي. 
", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091031_user.wav", + "user_duration_sec": 0.448 + }, + { + "timestamp": "20260427_091033", + "started_at": 1777252233.5816064, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091033_user.wav", + "user_duration_sec": 1.376 + }, + { + "timestamp": "20260427_091036", + "started_at": 1777252236.9071608, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091036_user.wav", + "user_duration_sec": 3.744 + }, + { + "timestamp": "20260427_091043", + "started_at": 1777252243.6780462, + "user_text": ".", + "robot_text": "I'm sorry,", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091043_user.wav", + "user_duration_sec": 2.976 + }, + { + "timestamp": "20260427_091049", + "started_at": 1777252249.3750374, + "user_text": "남쪽 라인 하나 둘 셋", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091049_user.wav", + "user_duration_sec": 0.864 + }, + { + "timestamp": "20260427_091051", + "started_at": 1777252251.6280284, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091051_user.wav", + "user_duration_sec": 1.312 + }, + { + "timestamp": "20260427_091054", + "started_at": 1777252254.4071884, + "user_text": "", + "robot_text": "It seems there's still some distortion.", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091054_user.wav", + "user_duration_sec": 1.568, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091054_robot.wav", + "robot_duration_sec": 1.68 + }, + { + "timestamp": "20260427_091058", + "started_at": 1777252258.6175046, + "user_text": "", + "robot_text": "Could", + "user_wav": 
"/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091058_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091058_robot.wav", + "robot_duration_sec": 0.88 + }, + { + "timestamp": "20260427_091059", + "started_at": 1777252259.0527377, + "user_text": "", + "robot_text": "you please", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091059_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091059_robot.wav", + "robot_duration_sec": 0.84 + }, + { + "timestamp": "20260427_091059", + "started_at": 1777252259.5382385, + "user_text": "", + "robot_text": "try again,", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091059_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091059_robot.wav", + "robot_duration_sec": 0.8 + }, + { + "timestamp": "20260427_091059", + "started_at": 1777252259.9547665, + "user_text": "", + "robot_text": "maybe with a clearer", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091059_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091059_robot.wav", + "robot_duration_sec": 1.08 + }, + { + "timestamp": "20260427_091100", + "started_at": 1777252260.424161, + "user_text": "", + "robot_text": "voice?", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091100_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091100_robot.wav", + "robot_duration_sec": 0.92 + }, + { + "timestamp": "20260427_091100", + "started_at": 1777252260.8556893, + "user_text": "", + "robot_text": "", + "user_wav": 
"/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091100_user.wav", + "user_duration_sec": 1.92 + }, + { + "timestamp": "20260427_091104", + "started_at": 1777252264.4879806, + "user_text": ".", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091104_user.wav", + "user_duration_sec": 5.888 + }, + { + "timestamp": "20260427_091114", + "started_at": 1777252274.7428982, + "user_text": ".", + "robot_text": "The audio is still cutting", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091114_user.wav", + "user_duration_sec": 4.16, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091114_robot.wav", + "robot_duration_sec": 1.84 + }, + { + "timestamp": "20260427_091121", + "started_at": 1777252281.7678783, + "user_text": "", + "robot_text": "out. Could you please", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091121_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091121_robot.wav", + "robot_duration_sec": 1.44 + }, + { + "timestamp": "20260427_091122", + "started_at": 1777252282.195701, + "user_text": "", + "robot_text": "say what you'd like me to do?", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091122_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091122_robot.wav", + "robot_duration_sec": 1.52 + }, + { + "timestamp": "20260427_091122", + "started_at": 1777252282.6409268, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091122_user.wav", + "user_duration_sec": 0.032, + "robot_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091122_robot.wav", + "robot_duration_sec": 0.28 + }, + { + "timestamp": "20260427_091123", + "started_at": 
1777252283.0551784, + "user_text": "", + "robot_text": "", + "user_wav": "/home/unitree/Marcus/Data/Voice/Recordings/gemini_turns/20260427_091123_user.wav", + "user_duration_sec": 2.24 + } + ], + "total_records": 95 +} \ No newline at end of file diff --git a/Voice/_probe_live.py b/Voice/_probe_live.py new file mode 100644 index 0000000..a54094a --- /dev/null +++ b/Voice/_probe_live.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +"""Probe — test which Gemini Live configurations actually connect. + +Run on the Jetson under the gemini_sdk env: + + /home/unitree/miniconda3/envs/gemini_sdk/bin/python \ + /home/unitree/Marcus/Voice/_probe_live.py + +Tries 6 model+config combos and reports which CONNECT successfully. +We don't send audio — just open the Live session and immediately close. +A successful connect == Google's server accepted our config; an error +means we either sent something invalid or used an unsupported model. +""" +import asyncio +import os +import sys + +API_KEY = ( + os.environ.get("MARCUS_GEMINI_API_KEY") + or os.environ.get("SANAD_GEMINI_API_KEY") + or "AIzaSyDt9Xi83MDZuuPpfwfHyMD92X7ZKdGkqf8" +) + +try: + from google import genai + from google.genai import types +except ImportError: + print("google-genai not installed", file=sys.stderr) + sys.exit(2) + + +client = genai.Client(api_key=API_KEY) + + +async def try_connect(label: str, model: str, config) -> str: + try: + async with client.aio.live.connect(model=model, config=config): + return "OK" + except Exception as e: + msg = str(e) + if len(msg) > 140: + msg = msg[:140] + "..." + return "FAIL: " + msg + + +async def main(): + cases = [] + + # 1. 
3.1 Live preview, TEXT, full Marcus config + cases.append(( + "3.1-live + TEXT + VAD + input_transcription + system_instruction", + "gemini-3.1-flash-live-preview", + types.LiveConnectConfig( + response_modalities=["TEXT"], + realtime_input_config=types.RealtimeInputConfig( + automatic_activity_detection=types.AutomaticActivityDetection( + disabled=False, + ), + ), + input_audio_transcription=types.AudioTranscriptionConfig(), + system_instruction=types.Content( + parts=[types.Part(text="Transcribe what the user says. Stay silent.")], + ), + ), + )) + + # 2. 3.1 Live preview, TEXT, BARE config (no extras) + cases.append(( + "3.1-live + TEXT + bare", + "gemini-3.1-flash-live-preview", + types.LiveConnectConfig(response_modalities=["TEXT"]), + )) + + # 3. 3.1 Live preview, AUDIO mode (full S2S, no transcription extras) + cases.append(( + "3.1-live + AUDIO + bare", + "gemini-3.1-flash-live-preview", + types.LiveConnectConfig(response_modalities=["AUDIO"]), + )) + + # 4. Sanad's native-audio model, AUDIO mode (known-working in Sanad) + cases.append(( + "native-audio-12-2025 + AUDIO + bare (Sanad's combo)", + "gemini-2.5-flash-native-audio-preview-12-2025", + types.LiveConnectConfig(response_modalities=["AUDIO"]), + )) + + # 5. Sanad's model + AUDIO + transcription extras (what Sanad actually sends) + cases.append(( + "native-audio-12-2025 + AUDIO + input/output transcription", + "gemini-2.5-flash-native-audio-preview-12-2025", + types.LiveConnectConfig( + response_modalities=["AUDIO"], + input_audio_transcription=types.AudioTranscriptionConfig(), + output_audio_transcription=types.AudioTranscriptionConfig(), + ), + )) + + # 6. 
native-audio + AUDIO + the input_transcription the runner adds + cases.append(( + "native-audio-09-2025 + AUDIO + bare", + "gemini-2.5-flash-native-audio-preview-09-2025", + types.LiveConnectConfig(response_modalities=["AUDIO"]), + )) + + print(f"{'CASE':<70} RESULT") + print("-" * 110) + for label, model, cfg in cases: + res = await try_connect(label, model, cfg) + marker = "✓" if res == "OK" else "✗" + print(f" {marker} {label:<66} {res}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/Voice/_probe_models.py b/Voice/_probe_models.py new file mode 100644 index 0000000..1915ce3 --- /dev/null +++ b/Voice/_probe_models.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +"""Probe — list Gemini models accessible to this API key. + +Run on the Jetson under the gemini_sdk env: + + /home/unitree/miniconda3/envs/gemini_sdk/bin/python \ + /home/unitree/Marcus/Voice/_probe_models.py + +Prints every model the key can see plus whether it supports +`bidiGenerateContent` (= the Live API). Pick one with that capability +and put its name in Config/config_Voice.json::stt.gemini_model. 
+""" +import os +import sys + +API_KEY = ( + os.environ.get("MARCUS_GEMINI_API_KEY") + or os.environ.get("SANAD_GEMINI_API_KEY") + or "AIzaSyDt9Xi83MDZuuPpfwfHyMD92X7ZKdGkqf8" +) + +try: + from google import genai +except ImportError: + print("google-genai not installed in this env", file=sys.stderr) + sys.exit(2) + +client = genai.Client(api_key=API_KEY) + +print(f"{'NAME':<60} {'METHODS'}") +print("-" * 100) +live_models = [] +for m in client.models.list(): + name = getattr(m, "name", "?") + methods = ( + getattr(m, "supported_actions", None) + or getattr(m, "supported_generation_methods", None) + or [] + ) + methods_str = ", ".join(methods) if methods else "-" + print(f"{name:<60} {methods_str}") + if "bidiGenerateContent" in methods: + live_models.append(name) + +print() +print("=" * 100) +print(f"Live API capable models (supports bidiGenerateContent):") +for name in live_models: + print(f" - {name}") +if not live_models: + print(" (none — your key may need Gemini Live access enabled)") diff --git a/Voice/gemini_runner.py b/Voice/gemini_runner.py index cea756e..a408a75 100644 --- a/Voice/gemini_runner.py +++ b/Voice/gemini_runner.py @@ -1,51 +1,55 @@ #!/usr/bin/env python3 -"""Voice/gemini_runner.py — Gemini Live STT subprocess. +"""Voice/gemini_runner.py — Gemini Live S2S subprocess (Option 3). -Runs in a Python 3.10+ env (the `gemini_sdk` conda env on this Jetson) so it -can import `google-genai`, which doesn't support Python 3.8. The marcus env -itself is pinned to Python 3.8 by the NVIDIA Jetson torch wheel, so Gemini -has to live in its own process — the same pattern Sanad uses. +Runs in a Python 3.10+ env (the `gemini_sdk` conda env on this Jetson) so +it can import `google-genai` (which doesn't support Python 3.8). The +marcus env itself is pinned to Python 3.8 by the NVIDIA Jetson torch +wheel, so Gemini has to live in its own process. 
-The marcus parent process spawns this script via: +This is the full Sanad-pattern speech-to-speech variant: + - response_modalities=["AUDIO"] → Gemini speaks back through G1 speaker + - input_audio_transcription → user transcripts emitted on stdout for + Marcus's wake-word side channel + - output_audio_transcription → Gemini's reply text logged for review + - barge-in detection → user speaking over AI cuts AI off + - echo suppression → mic muted during AI playback + - JPEG camera frames over stdin → Marcus parent forwards frames; runner + streams them to Gemini Live so the + vision answers ("what do you see") + are correct, not hallucinated. - /path/to/gemini_sdk/python -u Voice/gemini_runner.py - -and parses the JSON-lines stream we emit on stdout. The parent never sees -audio bytes — this script owns the mic, the Gemini WebSocket, AND the WAV -recording, so the IPC boundary stays narrow (just transcripts). +The runner owns the G1 mic AND the G1 speaker (unitree_sdk2py works fine +in the gemini_sdk env on this Jetson — already verified). 
──────────────────────────────────────────────────────────────────────── Stdout protocol (one JSON object per line, UTF-8): - {"type":"ready"} session connected, mic is live - {"type":"user", "text":"..."} user input transcription - {"type":"bot", "text":"..."} Gemini's text reply (logged only — never spoken) - {"type":"turn_end"} Gemini emitted turn_complete - {"type":"reconnect", "reason":"..."} session ended, will reconnect + {"type":"ready"} session connected + {"type":"user", "text":"..."} user input transcription + {"type":"bot", "text":"..."} Gemini's reply text (logged only — Gemini also speaks it) + {"type":"turn_end"} + {"type":"barge_in"} + {"type":"reconnect", "reason":"..."} {"type":"log", "level":"info|warn|error", "msg":"..."} Stdin protocol (line-based): - "stop\n" request graceful shutdown - -Exit codes: - 0 — clean shutdown after "stop" or signal - 2 — google-genai not importable - 3 — no API key - 4 — fatal session loop crash + "stop\n" graceful shutdown + "flush\n" drop mic buffer (echo prevention) + "frame:\n" forward a camera frame to Gemini Live + (Marcus parent throttles to ~2 fps) ──────────────────────────────────────────────────────────────────────── Env vars: - MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY) — required - MARCUS_GEMINI_MODEL (optional) — model id - MARCUS_GEMINI_VOICE (optional, ignored in TEXT mode) - MARCUS_PROJECT_ROOT (optional) — for sys.path - -This file uses Python 3.10+ syntax — type unions with `|`, etc. — because -the gemini_sdk env is 3.10+. DO NOT try to import it from marcus 3.8. 
+ MARCUS_GEMINI_API_KEY (or SANAD_GEMINI_API_KEY) + MARCUS_GEMINI_MODEL (optional) + MARCUS_GEMINI_VOICE (optional) + MARCUS_PROJECT_ROOT (optional) """ from __future__ import annotations +import array import asyncio +import base64 import json import os import signal @@ -56,8 +60,6 @@ from typing import Any import numpy as np -# Make the Marcus project importable so we can reuse Voice/audio_io.py and -# Voice/turn_recorder.py (both pure-stdlib + numpy, no Python-version traps). _PROJECT_ROOT = ( os.environ.get("MARCUS_PROJECT_ROOT") or os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -65,7 +67,7 @@ _PROJECT_ROOT = ( if _PROJECT_ROOT not in sys.path: sys.path.insert(0, _PROJECT_ROOT) -from Voice.audio_io import BuiltinMic +from Voice.audio_io import BuiltinMic, BuiltinSpeaker from Voice.turn_recorder import TurnRecorder try: @@ -75,15 +77,15 @@ except Exception: _VCFG = {} _STT = _VCFG.get("stt", {}) +_SPK = _VCFG.get("speaker", {}) -# ─── stdout / stderr helpers ────────────────────────────────────── +# ─── stdout / stderr helpers ───────────────────────────────────── _stdout_lock = threading.Lock() def emit(payload: dict) -> None: - """Write one JSON line to stdout. Thread-safe + flushed.""" line = json.dumps(payload, ensure_ascii=False, separators=(",", ":")) with _stdout_lock: sys.stdout.write(line + "\n") @@ -91,34 +93,49 @@ def emit(payload: dict) -> None: def log(level: str, msg: str) -> None: - """Send a log line to the parent (parent forwards to logs/voice.log).""" emit({"type": "log", "level": level, "msg": msg}) -# ─── stdin watcher (graceful shutdown) ──────────────────────────── +# ─── stdin watcher (graceful shutdown + flush + frames) ────────── _STOP_REQUESTED = threading.Event() -_MIC_HOLDER: list = [] # length-≤1 list — holds the active BuiltinMic +_MIC_HOLDER: list = [] # [BuiltinMic] when active + +# Latest JPEG frame from the parent (raw bytes). The frame-send loop +# picks this up and ships it to Gemini Live. 
+_LATEST_FRAME_LOCK = threading.Lock() +_LATEST_FRAME: dict = {"bytes": None, "ts": 0.0} def _stdin_watcher() -> None: try: for line in sys.stdin: - cmd = line.strip().lower() + line = line.rstrip("\n") + if not line: + continue + # Match the cheap commands first, then fall through to frame: + cmd = line.lower() if cmd == "stop": log("info", "stop received from parent — exiting") _STOP_REQUESTED.set() return elif cmd == "flush": - # Parent asks us to drop buffered mic audio (e.g. before - # TtsMaker plays a reply, so the robot's own voice doesn't - # come back as a fake user utterance). if _MIC_HOLDER: try: _MIC_HOLDER[0].flush() except Exception: pass + elif line.startswith("frame:"): + b64 = line[len("frame:"):] + try: + data = base64.b64decode(b64) + except Exception: + continue + if data: + with _LATEST_FRAME_LOCK: + _LATEST_FRAME["bytes"] = data + _LATEST_FRAME["ts"] = time.time() except Exception: return @@ -137,17 +154,16 @@ def _install_signal_handlers() -> None: pass -# ─── tunables (mirrors Voice/gemini_script.py reads) ────────────── +# ─── tunables ──────────────────────────────────────────────────── _MODEL = os.environ.get( "MARCUS_GEMINI_MODEL", _STT.get("gemini_model", "gemini-2.5-flash-native-audio-preview-12-2025"), ) -_DEFAULT_VOICE = os.environ.get( +_VOICE = os.environ.get( "MARCUS_GEMINI_VOICE", _STT.get("gemini_voice_name", "Charon"), ) - _API_KEY = ( os.environ.get("MARCUS_GEMINI_API_KEY") or os.environ.get("SANAD_GEMINI_API_KEY") @@ -161,20 +177,33 @@ _MAX_CONSECUTIVE_ERRORS = int(_STT.get("gemini_max_consecutive_errors", 10)) _NO_MESSAGES_TIMEOUT = float(_STT.get("gemini_no_messages_timeout_sec", 30)) SEND_SAMPLE_RATE = int(_STT.get("gemini_send_sample_rate", 16000)) +RECEIVE_SAMPLE_RATE = int(_STT.get("gemini_receive_sample_rate", 24000)) CHUNK_SIZE = int(_STT.get("gemini_chunk_size", 512)) _CHUNK_BYTES = CHUNK_SIZE * 2 +_SILENCE_PCM = b"\x00" * _CHUNK_BYTES + +_BARGE_THRESHOLD = int(_STT.get("gemini_barge_threshold", 500)) +_BARGE_CHUNKS 
= int(_STT.get("gemini_barge_loud_chunks_needed", 3)) +_BARGE_COOLDOWN = float(_STT.get("gemini_barge_cooldown_sec", 0.3)) +_ECHO_SUPPRESS_BELOW = int(_STT.get("gemini_echo_suppress_below", 500)) +_AI_GRACE = float(_STT.get("gemini_ai_speak_grace_sec", 0.15)) + +_FRAME_INTERVAL = float(_STT.get("gemini_frame_interval_sec", 0.5)) +_FRAME_MAX_AGE = float(_STT.get("gemini_frame_max_age_sec", 1.5)) _REC_ENABLED = bool(_STT.get("gemini_record_enabled", True)) -_RECV_RATE = int(_STT.get("gemini_receive_sample_rate", 24000)) _DATA_DIR = os.path.join( _PROJECT_ROOT, _VCFG.get("audio", {}).get("data_dir", "Data/Voice/Recordings"), "gemini_turns", ) -_SYS_PROMPT = _STT.get( - "gemini_system_prompt", - "Transcribe what the user says to Sanad. Stay silent.", +_SYS_PROMPT = ( + os.environ.get("MARCUS_GEMINI_SYSTEM_PROMPT") + or _STT.get( + "gemini_system_prompt", + "You are Sanad, a humanoid robot assistant. Reply briefly.", + ) ) _SP_FILE = _STT.get("gemini_system_prompt_file", "") if _SP_FILE: @@ -191,7 +220,65 @@ if _SP_FILE: pass -# ─── main async loop ────────────────────────────────────────────── +def _audio_energy(pcm: bytes) -> int: + try: + samples = array.array("h", pcm) + return sum(abs(s) for s in samples) // len(samples) if samples else 0 + except Exception: + return 0 + + +# ─── G1 speaker (audio playback) — initialized in main_async ──── + + +def _init_g1_speaker() -> BuiltinSpeaker | None: + """Initialise the G1 DDS audio client and wrap it in a BuiltinSpeaker. + + Sanad's audio_io.BuiltinSpeaker normally takes an already-initialised + AudioClient. This subprocess owns its own DDS init. 
+ """ + try: + from unitree_sdk2py.core.channel import ChannelFactoryInitialize + from unitree_sdk2py.g1.audio.g1_audio_client import AudioClient + except Exception as e: + log("error", f"unitree_sdk2py not importable in this env: {e}") + return None + + iface = _SPK.get("dds_interface", "eth0") + try: + ChannelFactoryInitialize(0, iface) + except Exception as e: + # Already initialised in this process — that's fine. + log("warn", f"ChannelFactoryInitialize: {e}") + + try: + ac = AudioClient() + ac.SetTimeout(10.0) + ac.Init() + try: + ac.SetVolume(int(_SPK.get("volume", 100))) + except Exception: + pass + except Exception as e: + log("error", f"AudioClient init failed: {e}") + return None + + return BuiltinSpeaker(ac, app_name=_SPK.get("app_name", "sanad")) + + +# ─── per-session state (reset on each connect) ─────────────────── + + +class _Session: + def __init__(self): + self.speaking = False + self.stream_started = False + self.barge_block_until = 0.0 + self.ai_speak_start = 0.0 + self.last_ai_audio = 0.0 + + +# ─── main async loop ───────────────────────────────────────────── def _build_config(types): @@ -201,7 +288,14 @@ def _build_config(types): silence_ms = int(_STT.get("gemini_vad_silence_duration_ms", 200)) return types.LiveConnectConfig( - response_modalities=["TEXT"], + response_modalities=["AUDIO"], + speech_config=types.SpeechConfig( + voice_config=types.VoiceConfig( + prebuilt_voice_config=types.PrebuiltVoiceConfig( + voice_name=_VOICE, + ), + ), + ), realtime_input_config=types.RealtimeInputConfig( automatic_activity_detection=types.AutomaticActivityDetection( disabled=False, @@ -212,15 +306,17 @@ def _build_config(types): ), ), input_audio_transcription=types.AudioTranscriptionConfig(), + output_audio_transcription=types.AudioTranscriptionConfig(), system_instruction=types.Content( parts=[types.Part(text=_SYS_PROMPT)], ), ) -async def _send_mic_loop(session, types_mod, mic, recorder, done: asyncio.Event) -> None: +async def _send_mic_loop(session, 
types_mod, mic, speaker, recorder, sess: _Session, done: asyncio.Event) -> None: loop = asyncio.get_event_loop() frame_pause = CHUNK_SIZE / float(SEND_SAMPLE_RATE) + loud_count = 0 last_activity = time.time() while not done.is_set() and not _STOP_REQUESTED.is_set(): @@ -239,25 +335,59 @@ async def _send_mic_loop(session, types_mod, mic, recorder, done: asyncio.Event) samples = np.clip(samples * _MIC_GAIN, -32768, 32767).astype(np.int16) raw = samples.tobytes() - # Per-turn user-audio capture for the WAV recorder. We don't have - # Gemini's "is the AI speaking" flag (no audio out), so capture - # whenever we have meaningful energy. - try: - samples_view = np.frombuffer(raw, dtype=np.int16) - if samples_view.size and int(np.abs(samples_view).max()) > 250: - recorder.capture_user(raw) - except Exception: - pass - + energy = _audio_energy(raw) now = time.time() - if now - last_activity > 10: - log("info", f"alive (idle {now - last_activity:.0f}s)") + + # Barge-in detection — sustained user energy interrupts the AI. + if sess.speaking and now >= sess.barge_block_until: + if (now - sess.ai_speak_start) >= _AI_GRACE: + if energy > _BARGE_THRESHOLD: + loud_count += 1 + else: + loud_count = max(0, loud_count - 1) + if loud_count > _BARGE_CHUNKS: + log("info", f"BARGE-IN (e={energy})") + emit({"type": "barge_in"}) + sess.speaking = False + sess.stream_started = False + try: + speaker.stop() + except Exception: + pass + try: + mic.flush() + except Exception: + pass + try: + recorder.finish_turn() + except Exception: + pass + loud_count = 0 + sess.barge_block_until = now + _BARGE_COOLDOWN + + # Echo suppression — while AI speaks, mute quiet mic frames so the + # mic doesn't feed Gemini its own voice. + send_data = raw + if sess.speaking and energy < _ECHO_SUPPRESS_BELOW: + send_data = _SILENCE_PCM + + # Capture user audio for the per-turn WAV (only when user actually speaks). 
+ if energy > 250 and not sess.speaking: + try: + recorder.capture_user(raw) + except Exception: + pass + + if energy > 250: + last_activity = now + elif now - last_activity > 10: + log("info", f"alive (no speech {now - last_activity:.0f}s)") last_activity = now try: await session.send_realtime_input( audio=types_mod.Blob( - data=raw, + data=send_data, mime_type=f"audio/pcm;rate={SEND_SAMPLE_RATE}", ), ) @@ -271,7 +401,31 @@ async def _send_mic_loop(session, types_mod, mic, recorder, done: asyncio.Event) await asyncio.sleep(frame_pause) -async def _receive_loop(session, recorder, done: asyncio.Event) -> None: +async def _send_frame_loop(session, types_mod, done: asyncio.Event) -> None: + """Periodically push the latest cached camera frame (JPEG) to Gemini Live.""" + while not done.is_set() and not _STOP_REQUESTED.is_set(): + await asyncio.sleep(_FRAME_INTERVAL) + with _LATEST_FRAME_LOCK: + data = _LATEST_FRAME.get("bytes") + ts = _LATEST_FRAME.get("ts", 0.0) + if not data: + continue + if (time.time() - ts) > _FRAME_MAX_AGE: + # Stale — don't waste tokens streaming a frame the camera abandoned. + continue + try: + await session.send_realtime_input( + video=types_mod.Blob(data=data, mime_type="image/jpeg"), + ) + except asyncio.CancelledError: + return + except Exception as e: + log("warn", f"frame send failed: {e}") + # Keep going — frames are best-effort. 
+ + +async def _receive_loop(session, speaker, recorder, sess: _Session, done: asyncio.Event) -> None: + loop = asyncio.get_event_loop() last_recv = time.time() try: while not done.is_set() and not _STOP_REQUESTED.is_set(): @@ -290,30 +444,78 @@ async def _receive_loop(session, recorder, done: asyncio.Event) -> None: if sc is None: continue + if getattr(sc, "interrupted", False) is True: + if sess.speaking: + log("info", "Gemini interrupted by server") + sess.speaking = False + sess.stream_started = False + try: + speaker.stop() + except Exception: + pass + try: + recorder.finish_turn() + except Exception: + pass + continue + it = getattr(sc, "input_transcription", None) if it is not None: text = (getattr(it, "text", "") or "").strip() - if text: + if text and not sess.speaking: emit({"type": "user", "text": text}) try: recorder.add_user_text(text) except Exception: pass + ot = getattr(sc, "output_transcription", None) + if ot is not None: + text = (getattr(ot, "text", "") or "").strip() + if text: + emit({"type": "bot", "text": text}) + try: + recorder.add_robot_text(text) + except Exception: + pass + mt = getattr(sc, "model_turn", None) if mt is not None: for part in getattr(mt, "parts", []) or []: - txt = getattr(part, "text", None) - if txt: - txt = txt.strip() - if txt: - emit({"type": "bot", "text": txt}) - try: - recorder.add_robot_text(txt) - except Exception: - pass + inl = getattr(part, "inline_data", None) + if inl is not None and getattr(inl, "data", None): + now = time.time() + if not sess.speaking: + sess.ai_speak_start = now + sess.speaking = True + sess.last_ai_audio = now + raw_audio = inl.data + try: + recorder.capture_robot(raw_audio) + except Exception: + pass + audio_arr = np.frombuffer(raw_audio, dtype=np.int16) + if not sess.stream_started: + await loop.run_in_executor(None, speaker.begin_stream) + sess.stream_started = True + await loop.run_in_executor( + None, speaker.send_chunk, + audio_arr, RECEIVE_SAMPLE_RATE, + ) if getattr(sc, 
"turn_complete", False): + if sess.speaking and sess.stream_started and not speaker.interrupted: + log("info", f"speaker {speaker.total_sent_sec:.1f}s") + await loop.run_in_executor(None, speaker.wait_finish) + elif sess.speaking and speaker.interrupted: + log("info", "speaker interrupted") + sess.speaking = False + sess.stream_started = False + try: + # Drop any echo of the just-played reply. + _MIC_HOLDER and _MIC_HOLDER[0].flush() + except Exception: + pass try: recorder.finish_turn() except Exception: @@ -351,21 +553,30 @@ async def main_async() -> int: return 4 config = _build_config(types) + mic = BuiltinMic() mic.start() - _MIC_HOLDER.append(mic) # expose to the stdin "flush" watcher + _MIC_HOLDER.append(mic) + + speaker = _init_g1_speaker() + if speaker is None: + log("error", "G1 speaker not available — exiting") + try: + mic.stop() + except Exception: + pass + return 5 recorder = TurnRecorder( enabled=_REC_ENABLED, out_dir=_DATA_DIR, user_rate=SEND_SAMPLE_RATE, - robot_rate=_RECV_RATE, + robot_rate=RECEIVE_SAMPLE_RATE, ) session_num = 0 consecutive_errors = 0 start = time.time() - rc = 0 try: while not _STOP_REQUESTED.is_set(): @@ -377,12 +588,14 @@ async def main_async() -> int: emit({"type": "ready"}) consecutive_errors = 0 mic.flush() + sess = _Session() done = asyncio.Event() try: await asyncio.wait_for( asyncio.gather( - _send_mic_loop(session, types, mic, recorder, done), - _receive_loop(session, recorder, done), + _send_mic_loop(session, types, mic, speaker, recorder, sess, done), + _send_frame_loop(session, types, done), + _receive_loop(session, speaker, recorder, sess, done), ), timeout=_SESSION_TIMEOUT, ) @@ -392,6 +605,10 @@ async def main_async() -> int: pass log("info", f"session #{session_num} ended — reconnecting in 1s") + try: + speaker.stop() + except Exception: + pass try: mic.flush() except Exception: @@ -421,8 +638,12 @@ async def main_async() -> int: mic.stop() except Exception: pass + try: + speaker.stop() + except Exception: + pass 
- return rc + return 0 def main() -> int: diff --git a/Voice/gemini_script.py b/Voice/gemini_script.py index e30c5e2..8766b52 100644 --- a/Voice/gemini_script.py +++ b/Voice/gemini_script.py @@ -26,13 +26,14 @@ Subprocess lookup order for the Python 3.10+ binary: from __future__ import annotations +import base64 import json import logging import os import subprocess import sys import threading -from typing import Callable, Optional +from typing import Callable, Optional, Union log = logging.getLogger("gemini_brain") @@ -102,17 +103,22 @@ class GeminiBrain: api_key: str = "", on_transcript: Optional[Callable[[str], None]] = None, on_command: Optional[Callable[[str, str], None]] = None, + on_bot_text: Optional[Callable[[str], None]] = None, + on_turn_end: Optional[Callable[[], None]] = None, ): self._voice_name = voice_name or "" self._system_prompt = system_prompt or "" self._api_key = api_key self._on_transcript = on_transcript self._on_command = on_command + self._on_bot_text = on_bot_text + self._on_turn_end = on_turn_end self._proc = None # type: Optional[subprocess.Popen] self._reader_thread = None # type: Optional[threading.Thread] self._err_thread = None # type: Optional[threading.Thread] self._stopping = False + self._stdin_lock = threading.Lock() # serialise stdin writes # config-loaded lazily so import order doesn't matter try: @@ -181,21 +187,40 @@ class GeminiBrain: self._err_thread.start() def flush_mic(self) -> None: - """ - Tell the runner subprocess to drop its buffered mic audio. + """Tell the runner to drop its mic buffer (echo prevention).""" + self._send_stdin("flush\n") - Used before AND after the brain speaks via TtsMaker so the robot's - own voice (which the mic picks up during playback) doesn't come - back from Gemini as a fake user utterance and accidentally hit - the wake-word gate. + def send_frame(self, jpeg: Union[bytes, str]) -> None: + """ + Forward a single camera frame (JPEG) to the runner so it can stream + it to Gemini Live. 
The runner throttles + de-stales internally. + + `jpeg` may be raw bytes (e.g. from cv2.imencode) OR an already- + base64 ASCII string (e.g. from API.camera_api.get_frame()). """ proc = self._proc + if proc is None or proc.stdin is None: + return + if isinstance(jpeg, bytes): + b64 = base64.b64encode(jpeg).decode("ascii") + elif isinstance(jpeg, str): + b64 = jpeg.strip() + else: + return + if not b64: + return + self._send_stdin("frame:" + b64 + "\n") + + def _send_stdin(self, line: str) -> None: + """Serialised stdin write — multiple threads (frame sender + flush) can call safely.""" + proc = self._proc if proc is None or proc.stdin is None: return try: - if not proc.stdin.closed: - proc.stdin.write("flush\n") - proc.stdin.flush() + with self._stdin_lock: + if not proc.stdin.closed: + proc.stdin.write(line) + proc.stdin.flush() except Exception: pass @@ -205,15 +230,7 @@ class GeminiBrain: if proc is None: return # Polite stop: send "stop\n" on stdin, then wait briefly, then SIGTERM. - try: - if proc.stdin and not proc.stdin.closed: - try: - proc.stdin.write("stop\n") - proc.stdin.flush() - except Exception: - pass - except Exception: - pass + self._send_stdin("stop\n") try: proc.wait(timeout=3) except Exception: @@ -280,8 +297,18 @@ class GeminiBrain: txt = (msg.get("text") or "").strip() if txt: log.info("GEMINI: %s", txt[:120]) + if self._on_bot_text is not None: + try: + self._on_bot_text(txt) + except Exception as e: + log.error("on_bot_text failed: %s", e) elif t == "turn_end": log.info("listening") + if self._on_turn_end is not None: + try: + self._on_turn_end() + except Exception as e: + log.error("on_turn_end failed: %s", e) elif t == "ready": log.info("connected — listening for speech") elif t == "reconnect": diff --git a/Voice/marcus_voice.py b/Voice/marcus_voice.py index 2a07953..8e10a68 100644 --- a/Voice/marcus_voice.py +++ b/Voice/marcus_voice.py @@ -86,10 +86,112 @@ def _log_transcript(action: str, text: str) -> None: _transcript_log.info("%-5s 
%s", action, (text or "").strip()) -# Module-level vocabulary — populated from Config/config_Voice.json::stt. -# Used by the wake-word gate and the fuzzy-match command normalizer. -WAKE_WORDS: set = set() -COMMAND_VOCAB: list = [] +# ─── instruction.json — bilingual phrase tables ────────────────── +# +# Single source of truth for every voice phrase the dispatch layer cares +# about: wake-word variants (EN + AR), per-action user_phrases (what the +# user might say), per-action bot_phrases (what Gemini might say back). +# Loaded ONCE at module import; rebuilds the runtime tables below. +# Adding a new accent / variant / action is a JSON-only edit — no Python +# change required. +import json as _json + + +def _load_instructions() -> dict: + path = os.path.join(PROJECT_ROOT, "Config", "instruction.json") + try: + with open(path, "r", encoding="utf-8") as f: + return _json.load(f) or {} + except Exception as e: + # Fail soft — empty tables mean the dispatch gate just rejects + # everything and is_running stays True. Better than a crash. + try: + log.error("instruction.json not loaded: %s", e) + except Exception: + pass + return {} + + +_INSTRUCTIONS = _load_instructions() + + +def _build_wake_words(data: dict) -> set: + out = set() + wake = data.get("wake_words", {}) or {} + for lang in ("english", "arabic"): + for w in wake.get(lang, []) or []: + if isinstance(w, str) and w.strip(): + out.add(w.strip().lower()) + return out + + +def _build_command_vocab(data: dict) -> list: + """English-only canonical phrases — used by the difflib fuzzy matcher. 
+ Includes every action's `canonical` plus all entries from + `user_phrases.english` (deduped, original order preserved per action).""" + seen = set() + out = [] + for action in (data.get("actions", {}) or {}).values(): + canon = (action.get("canonical") or "").strip() + if canon and canon not in seen: + seen.add(canon) + out.append(canon) + for p in (action.get("user_phrases", {}) or {}).get("english", []) or []: + p = (p or "").strip() + if p and p not in seen: + seen.add(p) + out.append(p) + return out + + +def _build_arabic_motion_map(data: dict) -> dict: + """Map Arabic user phrase → English canonical for every action.""" + out = {} + for action in (data.get("actions", {}) or {}).values(): + canon = (action.get("canonical") or "").strip() + if not canon: + continue + for p in (action.get("user_phrases", {}) or {}).get("arabic", []) or []: + p = (p or "").strip() + if p: + out[p] = canon + return out + + +def _build_bot_motion_patterns(data: dict) -> list: + """List of (needle, canonical) the bot dispatcher matches Gemini's + spoken reply against. English needles are lowercased so the dispatcher + can use case-insensitive `in` checks; Arabic needles are kept verbatim.""" + out = [] + for action in (data.get("actions", {}) or {}).values(): + canon = (action.get("canonical") or "").strip() + if not canon: + continue + bot = action.get("bot_phrases", {}) or {} + for p in bot.get("english", []) or []: + p = (p or "").strip() + if p: + out.append((p.lower(), canon)) + for p in bot.get("arabic", []) or []: + p = (p or "").strip() + if p: + out.append((p, canon)) + # Sort by needle length descending so multi-word phrases match before + # their shorter prefixes (e.g. "moving forward" before "moving"). + out.sort(key=lambda x: len(x[0]), reverse=True) + return out + + +# Module-level vocabulary tables, all derived from instruction.json. +# Mutable (rebuildable) — VoiceModule.__init__ re-reads in case the file +# changed since import. 
+WAKE_WORDS: set = _build_wake_words(_INSTRUCTIONS) +COMMAND_VOCAB: list = _build_command_vocab(_INSTRUCTIONS) +_ARABIC_MOTION_TO_CANONICAL: dict = _build_arabic_motion_map(_INSTRUCTIONS) +_BOT_MOTION_PATTERNS: list = _build_bot_motion_patterns(_INSTRUCTIONS) + +# Garbage patterns + min length stay in config_Voice.json (they're +# noise filtering, not voice instructions). GARBAGE_PATTERNS: set = set() _MIN_TRANSCRIPTION_LENGTH: int = 3 @@ -139,6 +241,20 @@ def _strip_wake_word(text: str) -> str: return text +def _translate_arabic_motion(text: str) -> str: + """Translate Arabic motion phrases to English canonical equivalents, + using the table built from instruction.json::actions[*].user_phrases.arabic. + Substring match; longest-needle wins so multi-word phrases match before + their shorter prefixes.""" + s = text.strip() + if not s or not _ARABIC_MOTION_TO_CANONICAL: + return text + for ar in sorted(_ARABIC_MOTION_TO_CANONICAL.keys(), key=len, reverse=True): + if ar in s: + return _ARABIC_MOTION_TO_CANONICAL[ar] + return text + + def _closest_command(text: str, cutoff: float = 0.72) -> str: """ Map a transcription to the closest known command phrase. @@ -183,16 +299,31 @@ class VoiceModule: self._stt = self._config.get("stt", {}) self._messages = self._config.get("messages", {}) - # Load vocab from config — single source of truth. - global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, _MIN_TRANSCRIPTION_LENGTH - WAKE_WORDS = {w.lower() for w in self._stt.get("wake_words", [])} - COMMAND_VOCAB = list(self._stt.get("command_vocab", [])) + # Reload instruction.json so a hot-edit between runs is picked + # up without re-importing the module. All four phrase tables + # (wake_words, command_vocab, Arabic→canonical map, bot motion + # patterns) are rebuilt from instruction.json — single source of + # truth. Garbage_patterns + min_transcription_length stay in + # config_Voice.json (those are noise filtering, not voice + # instruction tables). 
+ global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, \ + _MIN_TRANSCRIPTION_LENGTH, _ARABIC_MOTION_TO_CANONICAL, \ + _BOT_MOTION_PATTERNS, _INSTRUCTIONS + _INSTRUCTIONS = _load_instructions() + WAKE_WORDS = _build_wake_words(_INSTRUCTIONS) + COMMAND_VOCAB = _build_command_vocab(_INSTRUCTIONS) + _ARABIC_MOTION_TO_CANONICAL = _build_arabic_motion_map(_INSTRUCTIONS) + _BOT_MOTION_PATTERNS = _build_bot_motion_patterns(_INSTRUCTIONS) GARBAGE_PATTERNS = {p.lower() for p in self._stt.get("garbage_patterns", [])} _MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3)) self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72)) log.info( - "vocab loaded: %d wake_words, %d command_vocab, %d garbage_patterns", - len(WAKE_WORDS), len(COMMAND_VOCAB), len(GARBAGE_PATTERNS), + "instruction.json loaded: %d wake_words, %d command_vocab, " + "%d arabic→canonical, %d bot patterns; " + "+ %d garbage_patterns from config_Voice.json", + len(WAKE_WORDS), len(COMMAND_VOCAB), + len(_ARABIC_MOTION_TO_CANONICAL), len(_BOT_MOTION_PATTERNS), + len(GARBAGE_PATTERNS), ) # Dispatch dedup state: Gemini's input_transcription can fire @@ -201,10 +332,26 @@ class VoiceModule: self._last_gemini_canon = "" self._last_gemini_dispatch_at = 0.0 + # Wake-word gate state. Motion (whether triggered from the user's + # transcript OR from Gemini's verbal confirmation) only fires + # within a turn whose user transcript contained "Sanad" / "سند" + # at least once. Reset on turn_end. This is the safety rule: + # actions ALWAYS require the wake word, in either language. + self._wake_active_for_turn = False + # Gemini brain reference for flush_mic() — populated by # _voice_loop_gemini after spawning the runner subprocess. self._brain = None + # Per-turn buffer for Gemini's spoken text. Gemini Live emits the + # output transcription in many small chunks ("I", "see", "a", ...); + # we accumulate them and print one clean `[Sanad] said: "..."` + # line per turn. 
Flushed on turn_end OR when a chunk ends with + # sentence-ending punctuation. + self._gemini_say_buf = [] + self._gemini_say_lock = threading.Lock() + self._gemini_say_last_chunk_at = 0.0 + self._running = False self._thread = None @@ -277,7 +424,7 @@ class VoiceModule: ) log.info( - "Voice loop started — GEMINI STT subprocess " + "Voice loop started — GEMINI S2S subprocess " "(model=%s, voice=%s)", model, voice_name, ) @@ -288,14 +435,60 @@ class VoiceModule: api_key=api_key, on_transcript=self._on_gemini_transcript, on_command=self._dispatch_gemini_command, + on_bot_text=self._on_gemini_say_chunk, + on_turn_end=self._on_gemini_turn_end, ) self._brain = brain brain.start() + # ── Camera-frame sender ──────────────────────────────────── + # Stream JPEG frames to the runner so Gemini Live can SEE what + # the robot sees. Without this, "what do you see" / "describe + # this exhibit" answers would be hallucinations. The runner + # forwards them to Gemini as image/jpeg blobs and de-stales + # anything older than gemini_frame_max_age_sec. + send_frames = bool(self._stt.get("gemini_send_frames", True)) + frame_interval = float(self._stt.get("gemini_frame_interval_sec", 0.5)) + frame_thread = None + frame_stop = threading.Event() + if send_frames: + try: + from API.camera_api import get_frame as _camera_get_frame + except Exception as e: + log.warning("camera_api unavailable — frame streaming disabled: %s", e) + _camera_get_frame = None + if _camera_get_frame is not None: + def _frame_sender_loop(): + log.info( + "frame sender started — %.2fs interval, " + "streaming camera frames to Gemini Live", + frame_interval, + ) + while not frame_stop.is_set() and self._running: + try: + frame_b64 = _camera_get_frame() + if frame_b64: + # camera_api returns a base64 ASCII string — + # GeminiBrain.send_frame accepts that directly. 
+ brain.send_frame(frame_b64) + except Exception as e: + log.debug("frame send failed: %s", e) + frame_stop.wait(frame_interval) + log.info("frame sender stopped") + + frame_thread = threading.Thread( + target=_frame_sender_loop, + daemon=True, name="gemini-frames", + ) + frame_thread.start() + try: while self._running: time.sleep(0.25) finally: + frame_stop.set() + if frame_thread is not None: + frame_thread.join(timeout=2) brain.stop() self._brain = None @@ -306,6 +499,52 @@ class VoiceModule: if text: _log_transcript("HEARD", text) + def _on_gemini_say_chunk(self, text: str) -> None: + """ + Receive a Gemini output-transcription chunk. Two side effects: + 1. Forward to the bot dispatcher so motion can fire on + confirmation phrases (Turning right / Sitting down / etc.). + 2. Buffer the chunk for the per-turn `[Sanad] said: ...` line + that prints once on turn_end (or sooner if the chunk ends + with sentence punctuation). + """ + # Motion side-channel — chunk-level so dispatch is fast. + try: + self._dispatch_gemini_bot(text) + except Exception: + pass + + with self._gemini_say_lock: + self._gemini_say_buf.append(text) + self._gemini_say_last_chunk_at = time.time() + # Flush early if this chunk closes a sentence — typical for + # short acks like "Turning right." that arrive as one chunk. + if text.rstrip().endswith((".", "!", "?")): + self._flush_gemini_say_locked() + + def _on_gemini_turn_end(self) -> None: + """Flush any pending Gemini output chunks at turn boundary, + and close the wake-word motion gate for the next turn.""" + with self._gemini_say_lock: + self._flush_gemini_say_locked() + self._wake_active_for_turn = False + + def _flush_gemini_say_locked(self) -> None: + """Caller MUST hold self._gemini_say_lock. 
Prints one [Sanad] said: line.""" + if not self._gemini_say_buf: + return + joined = " ".join(t.strip() for t in self._gemini_say_buf if t).strip() + while " " in joined: + joined = joined.replace(" ", " ") + self._gemini_say_buf = [] + if joined: + _log_transcript("SAID", joined) + try: + print(f' [Sanad] said: "{joined[:200]}"') + print("Command: ", end="", flush=True) + except Exception: + pass + def _dispatch_gemini_command(self, text: str, lang: str = "en") -> None: """ Fire self._on_command for any transcript prefixed with the wake @@ -331,10 +570,20 @@ class VoiceModule: if not text or not _has_wake_word(text): return + # Wake word found — open the motion gate for this turn so the + # bot-side dispatcher (Gemini's spoken confirmation) is also + # allowed to fire. Reset on turn_end. + self._wake_active_for_turn = True + stripped = _strip_wake_word(text) if not stripped or len(stripped.strip()) < _MIN_TRANSCRIPTION_LENGTH: return + # Bilingual support: translate Arabic motion phrases to their + # English canonical equivalent BEFORE fuzzy-matching. Marcus's + # command_parser is English-only. + stripped = _translate_arabic_motion(stripped) + low = stripped.lower().strip().rstrip(".!?,") vocab_exact = {c.lower() for c in COMMAND_VOCAB} if low in GARBAGE_PATTERNS and low not in vocab_exact: @@ -386,6 +635,61 @@ class VoiceModule: log.info("fuzzy-match: %r → %r", text, canonical) return canonical + # _BOT_MOTION_PATTERNS is built at module load from + # Config/instruction.json::actions[*].bot_phrases (both English and + # Arabic). The dispatcher reads it via the module-level reference. + @property + def _BOT_MOTION_PATTERNS(self): + return _BOT_MOTION_PATTERNS + + def _dispatch_gemini_bot(self, text: str) -> None: + """ + Dispatch motion when Gemini's spoken reply contains a known motion- + confirmation pattern (English or Arabic). 
Strict gate: only fires + if the current turn's user transcript already passed the wake-word + check (`_wake_active_for_turn`) — so motion ALWAYS requires "Sanad" + / "سند" somewhere in the user's request. Dedups against the + user-transcript path so the same command can't fire twice. + """ + if not text: + return + # SAFETY GATE: motion only when wake word was heard this turn. + if not self._wake_active_for_turn: + return + + low = text.strip().rstrip(".!?,").lower() + # Note: we keep both English (lowercased) and Arabic patterns; + # Arabic is unaffected by .lower() since it has no case. + if not low: + return + + canon = None + for needle, cmd in self._BOT_MOTION_PATTERNS: + # Match-anywhere so leading filler ("OK, " / "حسناً، ") doesn't + # block the recognition. English needles are lowercase; Arabic + # needles match as-is. + if needle in low or needle in text: + canon = cmd + break + if canon is None: + return + + now = time.time() + cooldown = float(self._stt.get("command_cooldown_sec", 1.5)) + if (canon == self._last_gemini_canon + and now - self._last_gemini_dispatch_at < cooldown): + return + self._last_gemini_canon = canon + self._last_gemini_dispatch_at = now + + log.info("dispatch (gemini-bot): %s (heard: %r)", canon, text[:80]) + _log_transcript("CMD-BOT", canon) + if self._on_command: + try: + self._on_command(canon, "en") + except Exception as e: + log.error("on_command error: %s", e, exc_info=True) + # ─── start / stop ───────────────────────────────────── def start(self): @@ -406,6 +710,12 @@ class VoiceModule: self._thread = None log.info("Voice module stopped") + @property + def is_running(self) -> bool: + """True while the voice loop thread is alive.""" + t = self._thread + return bool(self._running and t is not None and t.is_alive()) + @property def is_speaking(self) -> bool: """Delegates to AudioAPI — True while TtsMaker is playing."""