#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Gemini Live voice orchestrator for Marcus.

Pipeline:

    G1 mic UDP ──► BuiltinMic (Voice/audio_io.py)
        │
        ▼
    GeminiBrain (Voice/gemini/script.py)
        │ audio out (24 kHz)
        ▼
    BuiltinSpeaker (Voice/audio_io.py) ──► G1 speaker
        │ user transcript (on_command)
        ▼
    _dispatch_gemini_command
        - require wake word "Sanad"
        - fuzzy-match command_vocab
        - dedup within command_cooldown_sec
        │
        ▼
    on_command(text, "en") ──► Marcus brain

Gemini owns both STT and TTS — it hears the user and replies with its own
voice. Marcus's on_command hook fires alongside Gemini's verbal reply so
motion commands ("Sanad, turn right") still move the robot body while
the conversation flows naturally.

Wake word is enforced at dispatch only — Gemini chats normally on all
speech; the robot moves only when "Sanad" + a recognised action phrase
appears in the transcript.
"""
|
||
|
||
from __future__ import annotations

import logging
import os
import re
import sys
import threading
import time
from difflib import SequenceMatcher
from logging.handlers import RotatingFileHandler
from typing import Callable, Optional

import numpy as np  # NOTE(review): np is not used in this module's visible code — confirm before removing

# Make the project root importable when this file is run directly
# (python Voice/marcus_voice.py) rather than as a package module.
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
    sys.path.insert(0, _PROJECT_DIR)
from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config
|
||
|
||
# Root logging: everything this module logs goes to logs/voice.log,
# rotating at 5 MB with 3 backups.
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        RotatingFileHandler(
            os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000, backupCount=3, encoding="utf-8",
        ),
    ],
)
log = logging.getLogger("marcus_voice")
|
||
|
||
|
||
# ── Transcript log ─────────────────────────────────────────────
# Every user transcript Gemini emits is written here in a simple
# one-line-per-entry format. Rotates every 5 MB × 3 backups.
_TRANSCRIPT_PATH = os.path.join(LOG_DIR, "transcript.log")
_transcript_log = logging.getLogger("transcript")
_transcript_log.setLevel(logging.INFO)
# Don't bubble transcript lines up into the root voice.log handler.
_transcript_log.propagate = False
# Guard against duplicate handlers if this module is imported more than once.
if not _transcript_log.handlers:
    _th = RotatingFileHandler(
        _TRANSCRIPT_PATH, maxBytes=5_000_000, backupCount=3, encoding="utf-8",
    )
    _th.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
    _transcript_log.addHandler(_th)
|
||
|
||
|
||
def _log_transcript(action: str, text: str) -> None:
    """Append one 'ACTION text' line to logs/transcript.log.

    The action tag is left-justified to 5 columns so entries align.
    """
    payload = (text or "").strip()
    _transcript_log.info("%-5s %s", action, payload)
|
||
|
||
|
||
# ─── instruction.json — bilingual phrase tables ──────────────────
|
||
#
|
||
# Single source of truth for every voice phrase the dispatch layer cares
|
||
# about: wake-word variants (EN + AR), per-action user_phrases (what the
|
||
# user might say), per-action bot_phrases (what Gemini might say back).
|
||
# Loaded ONCE at module import; rebuilds the runtime tables below.
|
||
# Adding a new accent / variant / action is a JSON-only edit — no Python
|
||
# change required.
|
||
import json as _json
|
||
|
||
|
||
def _load_instructions() -> dict:
    """Read Config/instruction.json and return it as a dict.

    Returns {} on any failure (missing file, bad JSON, null content).
    """
    path = os.path.join(PROJECT_ROOT, "Config", "instruction.json")
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = _json.load(f)
    except Exception as e:
        # Fail soft — empty tables mean the dispatch gate just rejects
        # everything and is_running stays True. Better than a crash.
        try:
            log.error("instruction.json not loaded: %s", e)
        except Exception:
            pass
        return {}
    return data or {}
|
||
|
||
|
||
_INSTRUCTIONS = _load_instructions()
|
||
|
||
|
||
def _build_wake_words(data: dict) -> set:
|
||
out = set()
|
||
wake = data.get("wake_words", {}) or {}
|
||
for lang in ("english", "arabic"):
|
||
for w in wake.get(lang, []) or []:
|
||
if isinstance(w, str) and w.strip():
|
||
out.add(w.strip().lower())
|
||
return out
|
||
|
||
|
||
def _build_command_vocab(data: dict) -> list:
|
||
"""English-only canonical phrases — used by the difflib fuzzy matcher.
|
||
Includes every action's `canonical` plus all entries from
|
||
`user_phrases.english` (deduped, original order preserved per action)."""
|
||
seen = set()
|
||
out = []
|
||
for action in (data.get("actions", {}) or {}).values():
|
||
canon = (action.get("canonical") or "").strip()
|
||
if canon and canon not in seen:
|
||
seen.add(canon)
|
||
out.append(canon)
|
||
for p in (action.get("user_phrases", {}) or {}).get("english", []) or []:
|
||
p = (p or "").strip()
|
||
if p and p not in seen:
|
||
seen.add(p)
|
||
out.append(p)
|
||
return out
|
||
|
||
|
||
def _build_arabic_motion_map(data: dict) -> dict:
|
||
"""Map Arabic user phrase → English canonical for every action."""
|
||
out = {}
|
||
for action in (data.get("actions", {}) or {}).values():
|
||
canon = (action.get("canonical") or "").strip()
|
||
if not canon:
|
||
continue
|
||
for p in (action.get("user_phrases", {}) or {}).get("arabic", []) or []:
|
||
p = (p or "").strip()
|
||
if p:
|
||
out[p] = canon
|
||
return out
|
||
|
||
|
||
def _build_bot_motion_patterns(data: dict) -> list:
|
||
"""List of (needle, canonical) the bot dispatcher matches Gemini's
|
||
spoken reply against. English needles are lowercased so the dispatcher
|
||
can use case-insensitive `in` checks; Arabic needles are kept verbatim."""
|
||
out = []
|
||
for action in (data.get("actions", {}) or {}).values():
|
||
canon = (action.get("canonical") or "").strip()
|
||
if not canon:
|
||
continue
|
||
bot = action.get("bot_phrases", {}) or {}
|
||
for p in bot.get("english", []) or []:
|
||
p = (p or "").strip()
|
||
if p:
|
||
out.append((p.lower(), canon))
|
||
for p in bot.get("arabic", []) or []:
|
||
p = (p or "").strip()
|
||
if p:
|
||
out.append((p, canon))
|
||
# Sort by needle length descending so multi-word phrases match before
|
||
# their shorter prefixes (e.g. "moving forward" before "moving").
|
||
out.sort(key=lambda x: len(x[0]), reverse=True)
|
||
return out
|
||
|
||
|
||
# Module-level vocabulary tables, all derived from instruction.json.
# Mutable (rebuildable) — VoiceModule.__init__ re-reads in case the file
# changed since import.
WAKE_WORDS: set = _build_wake_words(_INSTRUCTIONS)
COMMAND_VOCAB: list = _build_command_vocab(_INSTRUCTIONS)
_ARABIC_MOTION_TO_CANONICAL: dict = _build_arabic_motion_map(_INSTRUCTIONS)
_BOT_MOTION_PATTERNS: list = _build_bot_motion_patterns(_INSTRUCTIONS)

# Garbage patterns + min length stay in config_Voice.json (they're
# noise filtering, not voice instructions). These defaults are replaced
# from the stt config section in VoiceModule.__init__.
GARBAGE_PATTERNS: set = set()
_MIN_TRANSCRIPTION_LENGTH: int = 3
|
||
|
||
|
||
def _has_wake_word(text: str) -> bool:
    """True if `text` contains any wake-word variant as a whole word.

    Comparison is case-insensitive; \\b word boundaries prevent partial
    hits inside longer words.
    """
    low = text.lower()
    return any(
        re.search(r'\b' + re.escape(variant) + r'\b', low)
        for variant in WAKE_WORDS
    )
|
||
|
||
|
||
def _strip_wake_word_once(text: str) -> str:
    """Single pass of wake-word stripping. Use via _strip_wake_word().

    Order of checks:
      1. Whole utterance is just a wake word (plus trailing punctuation)
         → return "" (bare wake, no payload).
      2. Wake word at the START ("Sanad, turn right") → return the tail.
      3. Wake word at the END ("turn right, Sanad") → return the head.
    Start/end matching iterates longest variants first so a longer
    variant is not partially consumed by a shorter one.
    """
    stripped = text.strip()
    for w in WAKE_WORDS:
        # Case 1: nothing but the wake word (optionally punctuated).
        if re.fullmatch(rf'{re.escape(w)}[\s,.!?]*', stripped, re.IGNORECASE):
            return ""
    for w in sorted(WAKE_WORDS, key=len, reverse=True):
        # Case 2: leading wake word — keep everything after it.
        m = re.match(
            rf'^\s*{re.escape(w)}\s*[,.!?]?\s+(.+)$',
            text, re.IGNORECASE,
        )
        if m:
            return m.group(1).strip(' ,.!?')
        # Case 3: trailing wake word — keep everything before it.
        m = re.match(
            rf'^(.+?)\s+{re.escape(w)}\s*[.!?]*\s*$',
            text, re.IGNORECASE,
        )
        if m:
            return m.group(1).strip(' ,.!?')
    return text
|
||
|
||
|
||
def _strip_wake_word(text: str) -> str:
    """
    Remove the wake word from the start or end of text, iteratively, so
    repeated-wake transcriptions ("Sanad. Sanad.") fully collapse.
    Capped at 5 passes to prevent pathological inputs from looping.
    """
    passes_left = 5
    while passes_left:
        passes_left -= 1
        result = _strip_wake_word_once(text)
        if result == text:
            break
        text = result
    return text
|
||
|
||
|
||
def _translate_arabic_motion(text: str) -> str:
    """Translate Arabic motion phrases to English canonical equivalents,
    using the table built from instruction.json::actions[*].user_phrases.arabic.
    Substring match; longest-needle wins so multi-word phrases match before
    their shorter prefixes."""
    candidate = text.strip()
    if not candidate or not _ARABIC_MOTION_TO_CANONICAL:
        return text
    by_length = sorted(_ARABIC_MOTION_TO_CANONICAL, key=len, reverse=True)
    for needle in by_length:
        if needle in candidate:
            return _ARABIC_MOTION_TO_CANONICAL[needle]
    return text
|
||
|
||
|
||
def _closest_command(text: str, cutoff: float = 0.72) -> str:
    """
    Map a transcription to the closest known command phrase.

    Returns the canonical command if there's a close-enough match, else
    returns the original text unchanged.
    """
    low = text.lower().strip().rstrip(".!?,")
    if not low:
        return text

    # Fast path: a vocab phrase embedded verbatim in the transcript wins.
    for cmd in COMMAND_VOCAB:
        if cmd in low:
            return cmd

    # Slow path: best difflib ratio over the whole vocab. `max` returns
    # the FIRST maximal element, matching the original first-wins tie rule.
    scored = (
        (SequenceMatcher(None, low, cmd).ratio(), cmd)
        for cmd in COMMAND_VOCAB
    )
    best_ratio, best_cmd = max(
        scored, key=lambda pair: pair[0], default=(0.0, None),
    )
    return best_cmd if best_ratio >= cutoff else text
|
||
|
||
|
||
class VoiceModule:
    """Thin orchestrator around GeminiBrain + command dispatch.

    Owns the voice-loop thread, the per-turn wake-word motion gate, and
    the dedup state shared by the two dispatch paths (user transcript
    and Gemini's spoken confirmation).
    """

    def __init__(
        self,
        audio_api,
        on_command: Optional[Callable] = None,
        on_wake: Optional[Callable] = None,
    ):
        # audio_api: AudioAPI-like object; this module only reads its
        #   `.is_speaking` property (see the is_speaking delegate below).
        # on_command(text, lang): fired for recognised wake-word commands.
        # on_wake: stored here; not called in this module's visible code.
        self._audio = audio_api
        self._on_command = on_command
        self._on_wake = on_wake

        self._config = load_config("Voice")
        self._stt = self._config.get("stt", {})
        self._messages = self._config.get("messages", {})

        # Reload instruction.json so a hot-edit between runs is picked
        # up without re-importing the module. All four phrase tables
        # (wake_words, command_vocab, Arabic→canonical map, bot motion
        # patterns) are rebuilt from instruction.json — single source of
        # truth. Garbage_patterns + min_transcription_length stay in
        # config_Voice.json (those are noise filtering, not voice
        # instruction tables).
        global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, \
            _MIN_TRANSCRIPTION_LENGTH, _ARABIC_MOTION_TO_CANONICAL, \
            _BOT_MOTION_PATTERNS, _INSTRUCTIONS
        _INSTRUCTIONS = _load_instructions()
        WAKE_WORDS = _build_wake_words(_INSTRUCTIONS)
        COMMAND_VOCAB = _build_command_vocab(_INSTRUCTIONS)
        _ARABIC_MOTION_TO_CANONICAL = _build_arabic_motion_map(_INSTRUCTIONS)
        _BOT_MOTION_PATTERNS = _build_bot_motion_patterns(_INSTRUCTIONS)
        GARBAGE_PATTERNS = {p.lower() for p in self._stt.get("garbage_patterns", [])}
        _MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3))
        # Fuzzy-match cutoff forwarded to _closest_command by _normalize_command.
        self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72))
        log.info(
            "instruction.json loaded: %d wake_words, %d command_vocab, "
            "%d arabic→canonical, %d bot patterns; "
            "+ %d garbage_patterns from config_Voice.json",
            len(WAKE_WORDS), len(COMMAND_VOCAB),
            len(_ARABIC_MOTION_TO_CANONICAL), len(_BOT_MOTION_PATTERNS),
            len(GARBAGE_PATTERNS),
        )

        # Dispatch dedup state: Gemini's input_transcription can fire
        # multiple times per turn (streaming partials). Track the last
        # canonical command + timestamp so we don't move twice.
        self._last_gemini_canon = ""
        self._last_gemini_dispatch_at = 0.0

        # Wake-word gate state. Motion (whether triggered from the user's
        # transcript OR from Gemini's verbal confirmation) only fires
        # within a turn whose user transcript contained "Sanad" / "سند"
        # at least once. Reset on turn_end. This is the safety rule:
        # actions ALWAYS require the wake word, in either language.
        self._wake_active_for_turn = False

        # Gemini brain reference for flush_mic() — populated by
        # the voice loop after spawning the runner subprocess.
        self._brain = None

        # Per-turn buffer for Gemini's spoken text. Gemini Live emits the
        # output transcription in many small chunks ("I", "see", "a", ...);
        # we accumulate them and print one clean `[Sanad] said: "..."`
        # line per turn. Flushed on turn_end OR when a chunk ends with
        # sentence-ending punctuation.
        self._gemini_say_buf = []
        self._gemini_say_lock = threading.Lock()
        self._gemini_say_last_chunk_at = 0.0

        # Voice-loop thread lifecycle flags (see start()/stop()).
        self._running = False
        self._thread = None

        log.info("VoiceModule initialized (backend=gemini)")
|
||
|
||
# ─── main loop ────────────────────────────────────────
|
||
|
||
    def _voice_loop(self):
        """
        Spawn the Gemini Live STT subprocess (runs in the gemini_sdk
        Python 3.10+ env) and forward its transcripts into Marcus's
        dispatch gate. Marcus's main process never opens the Gemini
        WebSocket itself — google-genai needs Python ≥3.9 and marcus
        is pinned to 3.8 by the Jetson torch wheel.
        """
        # API key resolution order: env (two variants) then config.
        api_key = (
            os.environ.get("MARCUS_GEMINI_API_KEY")
            or os.environ.get("SANAD_GEMINI_API_KEY")
            or self._stt.get("gemini_api_key", "")
        )
        if not api_key:
            log.error(
                "No Gemini API key found. Set env MARCUS_GEMINI_API_KEY "
                "or stt.gemini_api_key in Config/config_Voice.json"
            )
            # Idle until stop() — keeps the thread alive (so is_running
            # stays True) without doing any work.
            while self._running:
                time.sleep(0.5)
            return

        # Deferred import: only needed once we actually have a key.
        from Voice.gemini_script import GeminiBrain

        # Env overrides for model + voice are passed through to the
        # runner subprocess automatically (it reads the same env vars).
        model = (
            os.environ.get("MARCUS_GEMINI_MODEL")
            or self._stt.get(
                "gemini_model",
                "gemini-2.5-flash-native-audio-preview-12-2025",
            )
        )
        voice_name = (
            os.environ.get("MARCUS_GEMINI_VOICE")
            or self._stt.get("gemini_voice_name", "Charon")
        )
        # System prompt: the runner reads the same config & file paths,
        # but we forward the resolved string in case marcus's config layer
        # picked a fallback. Forwarded via env in GeminiBrain.start().
        system_prompt = self._stt.get(
            "gemini_system_prompt",
            "Transcribe what the user says to Sanad. Stay silent.",
        )
        sp_file = self._stt.get("gemini_system_prompt_file", "")
        if sp_file:
            # Relative paths resolve against the project root.
            sp_path = sp_file if os.path.isabs(sp_file) else os.path.join(
                PROJECT_ROOT, sp_file,
            )
            try:
                with open(sp_path, "r", encoding="utf-8") as f:
                    loaded = f.read().strip()
                if loaded:
                    system_prompt = loaded
                    log.info(
                        "gemini system prompt loaded from %s (%d chars)",
                        sp_path, len(loaded),
                    )
            except Exception as e:
                log.warning(
                    "gemini_system_prompt_file=%r unreadable: %s — "
                    "using inline config", sp_file, e,
                )

        log.info(
            "Voice loop started — GEMINI S2S subprocess "
            "(model=%s, voice=%s)", model, voice_name,
        )

        brain = GeminiBrain(
            None, None,  # audio_io, recorder owned by runner
            voice_name=voice_name,
            system_prompt=system_prompt,
            api_key=api_key,
            on_transcript=self._on_gemini_transcript,
            on_command=self._dispatch_gemini_command,
            on_bot_text=self._on_gemini_say_chunk,
            on_turn_end=self._on_gemini_turn_end,
        )
        self._brain = brain
        brain.start()

        # ── Camera-frame sender ────────────────────────────────────
        # Stream JPEG frames to the runner so Gemini Live can SEE what
        # the robot sees. Without this, "what do you see" / "describe
        # this exhibit" answers would be hallucinations. The runner
        # forwards them to Gemini as image/jpeg blobs and de-stales
        # anything older than gemini_frame_max_age_sec.
        send_frames = bool(self._stt.get("gemini_send_frames", True))
        frame_interval = float(self._stt.get("gemini_frame_interval_sec", 0.5))
        frame_thread = None
        frame_stop = threading.Event()
        if send_frames:
            try:
                from API.camera_api import get_frame as _camera_get_frame
            except Exception as e:
                log.warning("camera_api unavailable — frame streaming disabled: %s", e)
                _camera_get_frame = None
            if _camera_get_frame is not None:
                def _frame_sender_loop():
                    # Periodically forward the latest camera frame until
                    # stop is signalled or the module shuts down.
                    log.info(
                        "frame sender started — %.2fs interval, "
                        "streaming camera frames to Gemini Live",
                        frame_interval,
                    )
                    while not frame_stop.is_set() and self._running:
                        try:
                            frame_b64 = _camera_get_frame()
                            if frame_b64:
                                # camera_api returns a base64 ASCII string —
                                # GeminiBrain.send_frame accepts that directly.
                                brain.send_frame(frame_b64)
                        except Exception as e:
                            log.debug("frame send failed: %s", e)
                        # Event.wait doubles as an interruptible sleep.
                        frame_stop.wait(frame_interval)
                    log.info("frame sender stopped")

                frame_thread = threading.Thread(
                    target=_frame_sender_loop,
                    daemon=True, name="gemini-frames",
                )
                frame_thread.start()

        try:
            # Park this thread until stop() clears the flag; the brain's
            # callbacks do all the real work.
            while self._running:
                time.sleep(0.25)
        finally:
            # Always tear down in order: frames → brain → reference.
            frame_stop.set()
            if frame_thread is not None:
                frame_thread.join(timeout=2)
            brain.stop()
            self._brain = None
|
||
|
||
# ─── dispatch side channel ────────────────────────────
|
||
|
||
def _on_gemini_transcript(self, text: str) -> None:
|
||
"""Log every user transcript to logs/transcript.log."""
|
||
if text:
|
||
_log_transcript("HEARD", text)
|
||
|
||
def _on_gemini_say_chunk(self, text: str) -> None:
|
||
"""
|
||
Receive a Gemini output-transcription chunk. Two side effects:
|
||
1. Forward to the bot dispatcher so motion can fire on
|
||
confirmation phrases (Turning right / Sitting down / etc.).
|
||
2. Buffer the chunk for the per-turn `[Sanad] said: ...` line
|
||
that prints once on turn_end (or sooner if the chunk ends
|
||
with sentence punctuation).
|
||
"""
|
||
# Motion side-channel — chunk-level so dispatch is fast.
|
||
try:
|
||
self._dispatch_gemini_bot(text)
|
||
except Exception:
|
||
pass
|
||
|
||
with self._gemini_say_lock:
|
||
self._gemini_say_buf.append(text)
|
||
self._gemini_say_last_chunk_at = time.time()
|
||
# Flush early if this chunk closes a sentence — typical for
|
||
# short acks like "Turning right." that arrive as one chunk.
|
||
if text.rstrip().endswith((".", "!", "?")):
|
||
self._flush_gemini_say_locked()
|
||
|
||
def _on_gemini_turn_end(self) -> None:
|
||
"""Flush any pending Gemini output chunks at turn boundary,
|
||
and close the wake-word motion gate for the next turn."""
|
||
with self._gemini_say_lock:
|
||
self._flush_gemini_say_locked()
|
||
self._wake_active_for_turn = False
|
||
|
||
def _flush_gemini_say_locked(self) -> None:
|
||
"""Caller MUST hold self._gemini_say_lock. Prints one [Sanad] said: line."""
|
||
if not self._gemini_say_buf:
|
||
return
|
||
joined = " ".join(t.strip() for t in self._gemini_say_buf if t).strip()
|
||
while " " in joined:
|
||
joined = joined.replace(" ", " ")
|
||
self._gemini_say_buf = []
|
||
if joined:
|
||
_log_transcript("SAID", joined)
|
||
try:
|
||
print(f' [Sanad] said: "{joined[:200]}"')
|
||
print("Command: ", end="", flush=True)
|
||
except Exception:
|
||
pass
|
||
|
||
    def _dispatch_gemini_command(self, text: str, lang: str = "en") -> None:
        """
        Fire self._on_command for any transcript prefixed with the wake
        word "Sanad". Marcus's brain is the authoritative decision maker
        in the STT-only architecture — it handles motion AND Q&A AND
        vision queries AND replies via TtsMaker.

        The vocab-match gate has been dropped: if the user says
        "Sanad, what's the weather" the transcript still reaches the
        brain, which either answers via its VLM or declines. This keeps
        all Gemini-heard queries routed through one place (Marcus) and
        removes the audio collision that full S2S had.

        Examples:
            "Sanad, turn right"     → strip → "turn right" → brain → motion
            "Sanad, what do you see"→ strip → "what do you see" → brain → VLM
            "Sanad"                 → bare wake → skip (no payload)
            "turn right"            → no wake word → skip (conversation gate)

        Dedup: Gemini emits streaming partials; same normalized command
        within command_cooldown_sec fires only once.
        """
        if not text or not _has_wake_word(text):
            return

        # Wake word found — open the motion gate for this turn so the
        # bot-side dispatcher (Gemini's spoken confirmation) is also
        # allowed to fire. Reset on turn_end.
        self._wake_active_for_turn = True

        stripped = _strip_wake_word(text)
        if not stripped or len(stripped.strip()) < _MIN_TRANSCRIPTION_LENGTH:
            return

        # Bilingual support: translate Arabic motion phrases to their
        # English canonical equivalent BEFORE fuzzy-matching. Marcus's
        # command_parser is English-only.
        stripped = _translate_arabic_motion(stripped)

        # Drop known transcription garbage — unless it happens to be an
        # exact vocab phrase, in which case it's a real command.
        low = stripped.lower().strip().rstrip(".!?,")
        vocab_exact = {c.lower() for c in COMMAND_VOCAB}
        if low in GARBAGE_PATTERNS and low not in vocab_exact:
            return

        # Fuzzy-normalize (maps "turn right up" → "turn right") if the
        # transcript is close to a vocab entry — but unlike before, we
        # forward everything that passed the wake-word gate, not just
        # vocab hits. Marcus's command_parser + VLM handles the rest.
        command = self._normalize_command(stripped)
        canon = command.lower().strip().rstrip(".!?,")

        # Dedup against the last dispatch (either path) within cooldown.
        now = time.time()
        cooldown = float(self._stt.get("command_cooldown_sec", 1.5))
        if (canon == self._last_gemini_canon
                and now - self._last_gemini_dispatch_at < cooldown):
            return
        self._last_gemini_canon = canon
        self._last_gemini_dispatch_at = now

        log.info("dispatch (gemini): %s", command[:120])
        _log_transcript("CMD", command)
        if self._on_command:
            try:
                self._on_command(command, lang or "en")
            except Exception as e:
                log.error("on_command error: %s", e, exc_info=True)
|
||
|
||
def flush_mic(self) -> None:
|
||
"""
|
||
Tell the Gemini runner subprocess to drop its mic buffer. Called
|
||
before AND after `audio_api.speak()` so the robot's own voice
|
||
(picked up by the mic during TtsMaker playback) doesn't come back
|
||
from Gemini as a fake user utterance.
|
||
No-op if the runner hasn't started yet.
|
||
"""
|
||
b = getattr(self, "_brain", None)
|
||
if b is None:
|
||
return
|
||
try:
|
||
b.flush_mic()
|
||
except Exception:
|
||
pass
|
||
|
||
def _normalize_command(self, text: str) -> str:
|
||
"""Fuzzy-match a transcription to the closest canonical phrase."""
|
||
canonical = _closest_command(text, cutoff=self._vocab_cutoff)
|
||
if canonical != text:
|
||
log.info("fuzzy-match: %r → %r", text, canonical)
|
||
return canonical
|
||
|
||
    # _BOT_MOTION_PATTERNS is built at module load from
    # Config/instruction.json::actions[*].bot_phrases (both English and
    # Arabic). The dispatcher reads it via the module-level reference.
    @property
    def _BOT_MOTION_PATTERNS(self):
        # Property indirection: always returns the CURRENT module-level
        # table, which __init__ rebuilds from instruction.json on every
        # construction.
        return _BOT_MOTION_PATTERNS
|
||
|
||
    def _dispatch_gemini_bot(self, text: str) -> None:
        """
        Dispatch motion when Gemini's spoken reply contains a known motion-
        confirmation pattern (English or Arabic). Strict gate: only fires
        if the current turn's user transcript already passed the wake-word
        check (`_wake_active_for_turn`) — so motion ALWAYS requires "Sanad"
        / "سند" somewhere in the user's request. Dedups against the
        user-transcript path so the same command can't fire twice.
        """
        if not text:
            return
        # SAFETY GATE: motion only when wake word was heard this turn.
        if not self._wake_active_for_turn:
            return

        low = text.strip().rstrip(".!?,").lower()
        # Note: we keep both English (lowercased) and Arabic patterns;
        # Arabic is unaffected by .lower() since it has no case.
        if not low:
            return

        # Patterns are pre-sorted longest-first, so the first hit is the
        # most specific one.
        canon = None
        for needle, cmd in self._BOT_MOTION_PATTERNS:
            # Match-anywhere so leading filler ("OK, " / "حسناً، ") doesn't
            # block the recognition. English needles are lowercase; Arabic
            # needles match as-is.
            if needle in low or needle in text:
                canon = cmd
                break
        if canon is None:
            return

        # Shared dedup state with _dispatch_gemini_command — either path
        # firing starts the cooldown for both.
        now = time.time()
        cooldown = float(self._stt.get("command_cooldown_sec", 1.5))
        if (canon == self._last_gemini_canon
                and now - self._last_gemini_dispatch_at < cooldown):
            return
        self._last_gemini_canon = canon
        self._last_gemini_dispatch_at = now

        log.info("dispatch (gemini-bot): %s (heard: %r)", canon, text[:80])
        _log_transcript("CMD-BOT", canon)
        if self._on_command:
            try:
                self._on_command(canon, "en")
            except Exception as e:
                log.error("on_command error: %s", e, exc_info=True)
|
||
|
||
# ─── start / stop ─────────────────────────────────────
|
||
|
||
def start(self):
|
||
if self._running:
|
||
log.warning("VoiceModule already running")
|
||
return
|
||
self._running = True
|
||
self._thread = threading.Thread(
|
||
target=self._voice_loop, daemon=True, name="voice",
|
||
)
|
||
self._thread.start()
|
||
log.info("Voice module started")
|
||
|
||
def stop(self):
|
||
self._running = False
|
||
if self._thread:
|
||
self._thread.join(timeout=5)
|
||
self._thread = None
|
||
log.info("Voice module stopped")
|
||
|
||
@property
|
||
def is_running(self) -> bool:
|
||
"""True while the voice loop thread is alive."""
|
||
t = self._thread
|
||
return bool(self._running and t is not None and t.is_alive())
|
||
|
||
@property
|
||
def is_speaking(self) -> bool:
|
||
"""Delegates to AudioAPI — True while TtsMaker is playing."""
|
||
try:
|
||
return bool(self._audio.is_speaking)
|
||
except Exception:
|
||
return False
|