779 lines
31 KiB
Python
779 lines
31 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Voice/marcus_voice.py — Gemini Live voice orchestrator for Marcus.
|
||
|
||
Pipeline:
|
||
|
||
G1 mic UDP ──► BuiltinMic (Voice/audio_io.py)
|
||
│
|
||
▼
|
||
GeminiBrain (Voice/gemini_script.py — subprocess in gemini_sdk env)
|
||
│ audio out (24 kHz, Gemini's voice through G1 speaker)
|
||
│ output_transcription (chunks of Gemini's reply text)
|
||
▼
|
||
_dispatch_gemini_bot (Marcus side-channel)
|
||
- match Gemini's spoken phrase against
|
||
Config/instruction.json::actions[*].bot_phrases
|
||
- dedup canonical command within command_cooldown_sec
|
||
│
|
||
▼
|
||
on_command(canonical, "en") ──► Marcus brain → motion
|
||
|
||
GEMINI IS THE WAKE-WORD GATEKEEPER. The persona prompt in
|
||
Config/config_Voice.json::gemini_system_prompt instructs Gemini to ONLY
|
||
speak motion-confirmation phrases ('Turning right', 'أستدير يميناً')
|
||
when the user said the wake word ('Sanad' or 'سند') AND requested an
|
||
action. If Gemini says a motion phrase, Marcus trusts that decision and
|
||
fires the matching robot motion. Marcus does NOT inspect user transcripts
|
||
for wake words anymore — single gatekeeper, no double-checking.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
import os
|
||
import re
|
||
import sys
|
||
import threading
|
||
import time
|
||
from difflib import SequenceMatcher
|
||
from logging.handlers import RotatingFileHandler
|
||
from typing import Callable, Optional
|
||
|
||
import numpy as np
|
||
|
||
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||
if _PROJECT_DIR not in sys.path:
|
||
sys.path.insert(0, _PROJECT_DIR)
|
||
from Core.env_loader import PROJECT_ROOT
|
||
from Core.config_loader import load_config
|
||
|
||
# All log files (voice.log, transcript.log) live under <project>/logs.
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

# Root logging config: file-only via a rotating handler (5 MB × 3 backups),
# no console handler — stdout is reserved for the interactive prompt.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        RotatingFileHandler(
            os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000, backupCount=3, encoding="utf-8",
        ),
    ],
)
log = logging.getLogger("marcus_voice")
|
||
|
||
|
||
# ── Transcript log ─────────────────────────────────────────────
# Every user transcript Gemini emits is written here in a simple
# one-line-per-entry format. Rotates every 5 MB × 3 backups.
_TRANSCRIPT_PATH = os.path.join(LOG_DIR, "transcript.log")
# Dedicated logger, decoupled from the root logger: propagate=False keeps
# transcript lines out of voice.log.
_transcript_log = logging.getLogger("transcript")
_transcript_log.setLevel(logging.INFO)
_transcript_log.propagate = False
# Guard against duplicate handlers if this module is imported more than once.
if not _transcript_log.handlers:
    _th = RotatingFileHandler(
        _TRANSCRIPT_PATH, maxBytes=5_000_000, backupCount=3, encoding="utf-8",
    )
    _th.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
    _transcript_log.addHandler(_th)
|
||
|
||
|
||
def _log_transcript(action: str, text: str) -> None:
    """Append one '<ACTION> <text>' line to logs/transcript.log.

    `action` is a short tag (HEARD / SAID / CMD-BOT) padded to 5 columns;
    `text` may be None or empty — it is normalized to a stripped string.
    """
    cleaned = (text or "").strip()
    _transcript_log.info("%-5s %s", action, cleaned)
|
||
|
||
|
||
# ─── instruction.json — bilingual phrase tables ──────────────────
|
||
#
|
||
# Single source of truth for every voice phrase the dispatch layer cares
|
||
# about: wake-word variants (EN + AR), per-action user_phrases (what the
|
||
# user might say), per-action bot_phrases (what Gemini might say back).
|
||
# Loaded ONCE at module import; rebuilds the runtime tables below.
|
||
# Adding a new accent / variant / action is a JSON-only edit — no Python
|
||
# change required.
|
||
import json as _json
|
||
|
||
|
||
def _load_instructions() -> dict:
    """Read Config/instruction.json and return its parsed contents.

    Fail-soft on any error (missing file, bad JSON, empty document):
    returns {} so the dispatch tables come up empty and the gate simply
    rejects everything — better than crashing the voice module.
    """
    cfg_path = os.path.join(PROJECT_ROOT, "Config", "instruction.json")
    try:
        with open(cfg_path, "r", encoding="utf-8") as fh:
            data = _json.load(fh)
        return data if data else {}
    except Exception as exc:
        try:
            log.error("instruction.json not loaded: %s", exc)
        except Exception:
            # Logging itself must never take the module down.
            pass
        return {}
|
||
|
||
|
||
# Parsed Config/instruction.json, loaded once at import time.
# VoiceModule.__init__ reloads it so hot-edits between runs are picked up.
_INSTRUCTIONS = _load_instructions()
|
||
|
||
|
||
def _build_wake_words(data: dict) -> set:
    """Collect every wake-word variant (English + Arabic) as a lowercase set.

    Reads instruction.json::wake_words.{english,arabic}; non-string and
    blank entries are ignored.
    """
    wake_table = data.get("wake_words", {}) or {}
    words = set()
    for lang in ("english", "arabic"):
        for entry in wake_table.get(lang, []) or []:
            if isinstance(entry, str):
                trimmed = entry.strip()
                if trimmed:
                    words.add(trimmed.lower())
    return words
|
||
|
||
|
||
def _build_command_vocab(data: dict) -> list:
    """English-only canonical phrases — used by the difflib fuzzy matcher.

    Includes every action's `canonical` plus all entries from
    `user_phrases.english`, deduplicated while preserving first-seen
    order per action.
    """
    vocab = []
    known = set()

    def _add(phrase) -> None:
        # Normalize, then keep only new, non-empty phrases.
        phrase = (phrase or "").strip()
        if phrase and phrase not in known:
            known.add(phrase)
            vocab.append(phrase)

    for action in (data.get("actions", {}) or {}).values():
        _add(action.get("canonical"))
        english = (action.get("user_phrases", {}) or {}).get("english", []) or []
        for phrase in english:
            _add(phrase)
    return vocab
|
||
|
||
|
||
def _build_arabic_motion_map(data: dict) -> dict:
    """Map each Arabic user phrase to its action's English canonical.

    Actions without a canonical are skipped; blank phrases are dropped.
    """
    mapping = {}
    for action in (data.get("actions", {}) or {}).values():
        canonical = (action.get("canonical") or "").strip()
        if not canonical:
            continue
        arabic = (action.get("user_phrases", {}) or {}).get("arabic", []) or []
        for phrase in arabic:
            phrase = (phrase or "").strip()
            if phrase:
                mapping[phrase] = canonical
    return mapping
|
||
|
||
|
||
def _build_parametric_actions(data: dict) -> list:
    """Compile parametric_actions into (compiled_regex, command_template) pairs.

    Every regex is compiled with re.IGNORECASE so 'Turning' / 'turning' /
    'TURNING' all match; Arabic text is unaffected by IGNORECASE.

    Bad entries (non-dict rows, missing regex/template, regex compile
    errors) are skipped — a warning is logged for compile errors and the
    rest of the table still loads.
    """
    compiled_pairs = []
    for entry in (data.get("parametric_actions", []) or []):
        if not isinstance(entry, dict):
            continue
        pattern = entry.get("regex")
        template = entry.get("command_template")
        if not pattern or not template:
            continue
        try:
            compiled_pairs.append((re.compile(pattern, re.IGNORECASE), template))
        except re.error as exc:
            try:
                log.warning("parametric_actions skipped (bad regex %r): %s", pattern, exc)
            except Exception:
                pass
    return compiled_pairs
|
||
|
||
|
||
def _format_param_template(template: str, groups: tuple) -> str:
    """Substitute $1, $2, … in a parametric command template with the
    matching regex groups.

    None groups (optional captures that didn't fire) are treated as empty
    strings. Tokens are substituted highest-index first: replacing "$1"
    before "$10" would corrupt "$10" into "<group1>0" (the bug in the
    previous ascending-order loop). Whitespace runs left behind by empty
    groups are collapsed to single spaces and the result is stripped,
    so 'turn  90 degrees' becomes 'turn 90 degrees'.
    """
    out = template
    # Descending index so multi-digit tokens ($10, $11, …) are consumed
    # before their single-digit prefixes ($1).
    for i in range(len(groups), 0, -1):
        g = groups[i - 1]
        out = out.replace("$" + str(i), "" if g is None else str(g))
    return re.sub(r"\s+", " ", out).strip()
|
||
|
||
|
||
def _build_bot_motion_patterns(data: dict) -> list:
    """List of (needle, canonical) pairs the bot dispatcher matches
    Gemini's spoken reply against.

    English needles are lowercased so the dispatcher can use
    case-insensitive `in` checks; Arabic needles are kept verbatim.
    The result is sorted by needle length descending so multi-word
    phrases match before their shorter prefixes (e.g. "moving forward"
    before "moving").
    """
    patterns = []
    for action in (data.get("actions", {}) or {}).values():
        canonical = (action.get("canonical") or "").strip()
        if not canonical:
            continue
        bot = action.get("bot_phrases", {}) or {}
        # (language key, lowercase?) — English folds case, Arabic doesn't.
        for lang, fold in (("english", True), ("arabic", False)):
            for phrase in bot.get(lang, []) or []:
                phrase = (phrase or "").strip()
                if phrase:
                    patterns.append((phrase.lower() if fold else phrase, canonical))
    patterns.sort(key=lambda pair: len(pair[0]), reverse=True)
    return patterns
|
||
|
||
|
||
# Module-level vocabulary tables, all derived from instruction.json.
# Mutable (rebuildable) — VoiceModule.__init__ re-reads in case the file
# changed since import.
WAKE_WORDS: set = _build_wake_words(_INSTRUCTIONS)
COMMAND_VOCAB: list = _build_command_vocab(_INSTRUCTIONS)
_ARABIC_MOTION_TO_CANONICAL: dict = _build_arabic_motion_map(_INSTRUCTIONS)
_BOT_MOTION_PATTERNS: list = _build_bot_motion_patterns(_INSTRUCTIONS)
_PARAMETRIC_ACTIONS: list = _build_parametric_actions(_INSTRUCTIONS)

# Garbage patterns + min length stay in config_Voice.json (they're
# noise filtering, not voice instructions). These are placeholders —
# VoiceModule.__init__ overwrites both from the live config.
GARBAGE_PATTERNS: set = set()
_MIN_TRANSCRIPTION_LENGTH: int = 3
|
||
|
||
|
||
def _has_wake_word(text: str) -> bool:
    """True if `text` contains any wake-word variant as a whole word.

    Matching is case-insensitive (the table is lowercase and `text` is
    folded); \\b word boundaries keep "sanadine" from matching "sanad".
    """
    low = text.lower()
    return any(
        re.search(r'\b' + re.escape(variant) + r'\b', low)
        for variant in WAKE_WORDS
    )
|
||
|
||
|
||
def _strip_wake_word_once(text: str) -> str:
    """Single pass of wake-word stripping. Use via _strip_wake_word().

    Three cases, checked in order:
      1. The whole utterance is just a wake word plus trailing
         punctuation ("Sanad!") → return "".
      2. Wake word at the start ("Sanad, turn right") → return the
         remainder.
      3. Wake word at the end ("turn right, Sanad.") → return the prefix.
    Longest wake words are tried first in cases 2–3 so multi-word
    variants win over their own prefixes. Returns `text` unchanged when
    nothing matches.
    """
    stripped = text.strip()
    # Case 1: nothing but the wake word (plus separators/punctuation).
    for w in WAKE_WORDS:
        if re.fullmatch(rf'{re.escape(w)}[\s,.!?]*', stripped, re.IGNORECASE):
            return ""
    for w in sorted(WAKE_WORDS, key=len, reverse=True):
        # Case 2: leading wake word, then the actual command.
        m = re.match(
            rf'^\s*{re.escape(w)}\s*[,.!?]?\s+(.+)$',
            text, re.IGNORECASE,
        )
        if m:
            return m.group(1).strip(' ,.!?')
        # Case 3: command first, wake word trailing.
        m = re.match(
            rf'^(.+?)\s+{re.escape(w)}\s*[.!?]*\s*$',
            text, re.IGNORECASE,
        )
        if m:
            return m.group(1).strip(' ,.!?')
    return text
|
||
|
||
|
||
def _strip_wake_word(text: str) -> str:
    """
    Remove the wake word from the start or end of text, iteratively, so
    repeated-wake transcriptions ("Sanad. Sanad.") fully collapse.
    Capped at 5 passes to prevent pathological inputs from looping.
    """
    for _ in range(5):
        result = _strip_wake_word_once(text)
        if result == text:
            # Fixed point reached — nothing more to strip.
            break
        text = result
    return text
|
||
|
||
|
||
def _translate_arabic_motion(text: str) -> str:
    """Translate Arabic motion phrases to English canonical equivalents,
    using the table built from
    instruction.json::actions[*].user_phrases.arabic.

    Substring match; longest needle wins so multi-word phrases match
    before their shorter prefixes. Returns `text` unchanged when no
    needle is present (or the table is empty).
    """
    candidate = text.strip()
    if candidate and _ARABIC_MOTION_TO_CANONICAL:
        needles = sorted(_ARABIC_MOTION_TO_CANONICAL, key=len, reverse=True)
        for needle in needles:
            if needle in candidate:
                return _ARABIC_MOTION_TO_CANONICAL[needle]
    return text
|
||
|
||
|
||
def _closest_command(text: str, cutoff: float = 0.72) -> str:
    """
    Map a transcription to the closest known command phrase.

    First tries a cheap substring check against COMMAND_VOCAB, then falls
    back to difflib similarity. Returns the canonical command if there's
    a close-enough match (ratio >= cutoff), else returns the original
    text unchanged.
    """
    low = text.lower().strip().rstrip(".!?,")
    if not low:
        return text

    # Fast path: the vocabulary phrase appears verbatim inside the text.
    for candidate in COMMAND_VOCAB:
        if candidate in low:
            return candidate

    # Slow path: highest-similarity phrase, first occurrence wins ties.
    best_ratio, best_cmd = 0.0, None
    for candidate in COMMAND_VOCAB:
        score = SequenceMatcher(None, low, candidate).ratio()
        if score > best_ratio:
            best_ratio, best_cmd = score, candidate

    return best_cmd if best_ratio >= cutoff else text
|
||
|
||
|
||
class VoiceModule:
    """Thin orchestrator around GeminiBrain + command dispatch.

    Spawns the Gemini Live runner subprocess (`_voice_loop`), forwards
    Gemini's output-transcription chunks into the bot-phrase dispatcher
    (`_dispatch_gemini_bot`), and fires `on_command(cmd, "en")` whenever
    Gemini confirms a motion. The wake-word gate lives INSIDE Gemini's
    persona prompt — Marcus never inspects user transcripts for wake
    words.

    Fix in this revision: `_flush_gemini_say_locked` previously collapsed
    whitespace with `while " " in joined: joined = joined.replace(" ", " ")`
    — a no-op replace that loops forever once the joined text contains any
    space. Replaced with a single `re.sub(r"\\s+", " ", ...)` pass.
    """

    def __init__(
        self,
        audio_api,
        on_command: Optional[Callable] = None,
        on_wake: Optional[Callable] = None,
    ):
        """
        Args:
            audio_api:  AudioAPI facade; only `.is_speaking` is read here.
            on_command: callback(canonical_command: str, lang: str) fired
                        when Gemini confirms a motion.
            on_wake:    stored but unused here — kept for interface
                        compatibility (the wake-word gate is inside Gemini).
        """
        self._audio = audio_api
        self._on_command = on_command
        self._on_wake = on_wake

        self._config = load_config("Voice")
        self._stt = self._config.get("stt", {})
        self._messages = self._config.get("messages", {})

        # Reload instruction.json so a hot-edit between runs is picked
        # up without re-importing the module. All phrase tables
        # (wake_words, command_vocab, Arabic→canonical map, bot motion
        # patterns, parametric actions) are rebuilt from instruction.json
        # — single source of truth. garbage_patterns and
        # min_transcription_length stay in config_Voice.json (those are
        # noise filtering, not voice instruction tables).
        global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, \
            _MIN_TRANSCRIPTION_LENGTH, _ARABIC_MOTION_TO_CANONICAL, \
            _BOT_MOTION_PATTERNS, _PARAMETRIC_ACTIONS, _INSTRUCTIONS
        _INSTRUCTIONS = _load_instructions()
        WAKE_WORDS = _build_wake_words(_INSTRUCTIONS)
        COMMAND_VOCAB = _build_command_vocab(_INSTRUCTIONS)
        _ARABIC_MOTION_TO_CANONICAL = _build_arabic_motion_map(_INSTRUCTIONS)
        _BOT_MOTION_PATTERNS = _build_bot_motion_patterns(_INSTRUCTIONS)
        _PARAMETRIC_ACTIONS = _build_parametric_actions(_INSTRUCTIONS)
        GARBAGE_PATTERNS = {p.lower() for p in self._stt.get("garbage_patterns", [])}
        _MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3))
        self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72))
        log.info(
            "instruction.json loaded: %d wake_words, %d command_vocab, "
            "%d arabic→canonical, %d bot patterns, %d parametric; "
            "+ %d garbage_patterns from config_Voice.json",
            len(WAKE_WORDS), len(COMMAND_VOCAB),
            len(_ARABIC_MOTION_TO_CANONICAL), len(_BOT_MOTION_PATTERNS),
            len(_PARAMETRIC_ACTIONS), len(GARBAGE_PATTERNS),
        )

        # Dispatch dedup state. Gemini's output_transcription arrives in
        # many small chunks per turn — and one chunk may contain MULTIPLE
        # motion phrases for compound commands ("Turning right, then
        # moving forward."). Two layers of dedup:
        #   1. _fired_canons_this_turn — set of canonicals already fired
        #      since the current Gemini turn started; cleared on turn_end.
        #      Same motion never fires twice within one reply.
        #   2. _last_gemini_canon / _last_gemini_dispatch_at — cross-chunk
        #      cooldown safety net for cases where a turn_end is delayed
        #      and the next turn's phrase arrives before the reset.
        self._fired_canons_this_turn: set = set()
        self._last_gemini_canon = ""
        self._last_gemini_dispatch_at = 0.0
        # Wake-word gate is enforced INSIDE Gemini via the persona prompt
        # (see config_Voice.json::gemini_system_prompt). Marcus does NOT
        # check for "Sanad"/"سند" in Python anymore — if Gemini speaks
        # a motion-confirmation phrase, that's its own decision and we
        # trust it. Cleaner, single gatekeeper, no double-checking.

        # Gemini brain reference for flush_mic() — populated by
        # _voice_loop after spawning the runner subprocess.
        self._brain = None

        # Per-turn buffer for Gemini's spoken text. Gemini Live emits the
        # output transcription in many small chunks ("I", "see", "a", ...);
        # we accumulate them and print one clean `[Sanad] said: "..."`
        # line per turn. Flushed on turn_end OR when a chunk ends with
        # sentence-ending punctuation.
        self._gemini_say_buf = []
        self._gemini_say_lock = threading.Lock()
        self._gemini_say_last_chunk_at = 0.0

        self._running = False
        self._thread = None

        log.info("VoiceModule initialized (backend=gemini)")

    # ─── main loop ────────────────────────────────────────

    def _voice_loop(self):
        """
        Spawn the Gemini Live STT subprocess (runs in the gemini_sdk
        Python 3.10+ env) and forward its transcripts into Marcus's
        dispatch gate. Marcus's main process never opens the Gemini
        WebSocket itself — google-genai needs Python ≥3.9 and marcus
        is pinned to 3.8 by the Jetson torch wheel.
        """
        api_key = (
            os.environ.get("MARCUS_GEMINI_API_KEY")
            or os.environ.get("SANAD_GEMINI_API_KEY")
            or self._stt.get("gemini_api_key", "")
        )
        if not api_key:
            log.error(
                "No Gemini API key found. Set env MARCUS_GEMINI_API_KEY "
                "or stt.gemini_api_key in Config/config_Voice.json"
            )
            # Idle until stop() — keeps is_running truthful without a key.
            while self._running:
                time.sleep(0.5)
            return

        from Voice.gemini_script import GeminiBrain

        # Env overrides for model + voice are passed through to the
        # runner subprocess automatically (it reads the same env vars).
        model = (
            os.environ.get("MARCUS_GEMINI_MODEL")
            or self._stt.get(
                "gemini_model",
                "gemini-2.5-flash-native-audio-preview-12-2025",
            )
        )
        voice_name = (
            os.environ.get("MARCUS_GEMINI_VOICE")
            or self._stt.get("gemini_voice_name", "Charon")
        )
        # System prompt: the runner reads the same config & file paths,
        # but we forward the resolved string in case marcus's config layer
        # picked a fallback. Forwarded via env in GeminiBrain.start().
        system_prompt = self._stt.get(
            "gemini_system_prompt",
            "Transcribe what the user says to Sanad. Stay silent.",
        )
        sp_file = self._stt.get("gemini_system_prompt_file", "")
        if sp_file:
            sp_path = sp_file if os.path.isabs(sp_file) else os.path.join(
                PROJECT_ROOT, sp_file,
            )
            try:
                with open(sp_path, "r", encoding="utf-8") as f:
                    loaded = f.read().strip()
                if loaded:
                    system_prompt = loaded
                    log.info(
                        "gemini system prompt loaded from %s (%d chars)",
                        sp_path, len(loaded),
                    )
            except Exception as e:
                log.warning(
                    "gemini_system_prompt_file=%r unreadable: %s — "
                    "using inline config", sp_file, e,
                )

        log.info(
            "Voice loop started — GEMINI S2S subprocess "
            "(model=%s, voice=%s)", model, voice_name,
        )

        brain = GeminiBrain(
            None, None,  # audio_io, recorder owned by runner
            voice_name=voice_name,
            system_prompt=system_prompt,
            api_key=api_key,
            on_transcript=self._on_gemini_transcript,
            # No on_command — Marcus does NOT inspect user transcripts
            # for wake words anymore. Gemini's persona prompt enforces
            # the wake-word gate; if Gemini speaks a motion phrase, the
            # bot-side dispatcher below picks it up and fires motion.
            on_bot_text=self._on_gemini_say_chunk,
            on_turn_end=self._on_gemini_turn_end,
        )
        self._brain = brain
        brain.start()

        # ── Camera-frame sender ────────────────────────────────────
        # Stream JPEG frames to the runner so Gemini Live can SEE what
        # the robot sees. Without this, "what do you see" / "describe
        # this exhibit" answers would be hallucinations. The runner
        # forwards them to Gemini as image/jpeg blobs and de-stales
        # anything older than gemini_frame_max_age_sec.
        send_frames = bool(self._stt.get("gemini_send_frames", True))
        frame_interval = float(self._stt.get("gemini_frame_interval_sec", 0.5))
        frame_thread = None
        frame_stop = threading.Event()
        if send_frames:
            try:
                from API.camera_api import get_frame as _camera_get_frame
            except Exception as e:
                log.warning("camera_api unavailable — frame streaming disabled: %s", e)
                _camera_get_frame = None
            if _camera_get_frame is not None:
                def _frame_sender_loop():
                    log.info(
                        "frame sender started — %.2fs interval, "
                        "streaming camera frames to Gemini Live",
                        frame_interval,
                    )
                    while not frame_stop.is_set() and self._running:
                        try:
                            frame_b64 = _camera_get_frame()
                            if frame_b64:
                                # camera_api returns a base64 ASCII string —
                                # GeminiBrain.send_frame accepts that directly.
                                brain.send_frame(frame_b64)
                        except Exception as e:
                            log.debug("frame send failed: %s", e)
                        frame_stop.wait(frame_interval)
                    log.info("frame sender stopped")

                frame_thread = threading.Thread(
                    target=_frame_sender_loop,
                    daemon=True, name="gemini-frames",
                )
                frame_thread.start()

        try:
            while self._running:
                time.sleep(0.25)
        finally:
            # Tear down in dependency order: frames → brain → reference.
            frame_stop.set()
            if frame_thread is not None:
                frame_thread.join(timeout=2)
            brain.stop()
            self._brain = None

    # ─── dispatch side channel ────────────────────────────

    def _on_gemini_transcript(self, text: str) -> None:
        """Log every user transcript to logs/transcript.log."""
        if text:
            _log_transcript("HEARD", text)

    def _on_gemini_say_chunk(self, text: str) -> None:
        """
        Receive a Gemini output-transcription chunk. Two side effects:
          1. Forward to the bot dispatcher so motion can fire on
             confirmation phrases (Turning right / Sitting down / etc.).
          2. Buffer the chunk for the per-turn `[Sanad] said: ...` line
             that prints once on turn_end (or sooner if the chunk ends
             with sentence punctuation).
        """
        # Motion side-channel — chunk-level so dispatch is fast.
        try:
            self._dispatch_gemini_bot(text)
        except Exception:
            pass

        with self._gemini_say_lock:
            self._gemini_say_buf.append(text)
            self._gemini_say_last_chunk_at = time.time()
            # Flush early if this chunk closes a sentence — typical for
            # short acks like "Turning right." that arrive as one chunk.
            if text.rstrip().endswith((".", "!", "?")):
                self._flush_gemini_say_locked()

    def _on_gemini_turn_end(self) -> None:
        """Flush any pending Gemini output chunks at turn boundary, and
        reset the per-turn motion dedup set so the next user→Gemini
        exchange starts fresh."""
        with self._gemini_say_lock:
            self._flush_gemini_say_locked()
        self._fired_canons_this_turn.clear()

    def _flush_gemini_say_locked(self) -> None:
        """Caller MUST hold self._gemini_say_lock. Prints one [Sanad] said: line."""
        if not self._gemini_say_buf:
            return
        joined = " ".join(t.strip() for t in self._gemini_say_buf if t).strip()
        # Collapse whitespace runs in one pass. (The previous
        # `while " " in joined: joined = joined.replace(" ", " ")` loop
        # was a no-op replace and spun forever once any space existed.)
        joined = re.sub(r"\s+", " ", joined)
        self._gemini_say_buf = []
        if joined:
            _log_transcript("SAID", joined)
            try:
                print(f' [Sanad] said: "{joined[:200]}"')
                print("Command: ", end="", flush=True)
            except Exception:
                pass

    def flush_mic(self) -> None:
        """
        Tell the Gemini runner subprocess to drop its mic buffer. Called
        before AND after `audio_api.speak()` so the robot's own voice
        (picked up by the mic during TtsMaker playback) doesn't come back
        from Gemini as a fake user utterance.

        No-op if the runner hasn't started yet.
        """
        b = getattr(self, "_brain", None)
        if b is None:
            return
        try:
            b.flush_mic()
        except Exception:
            pass

    def _normalize_command(self, text: str) -> str:
        """Fuzzy-match a transcription to the closest canonical phrase."""
        canonical = _closest_command(text, cutoff=self._vocab_cutoff)
        if canonical != text:
            log.info("fuzzy-match: %r → %r", text, canonical)
        return canonical

    # _BOT_MOTION_PATTERNS is built at module load from
    # Config/instruction.json::actions[*].bot_phrases (both English and
    # Arabic). The dispatcher reads it via the module-level reference.
    @property
    def _BOT_MOTION_PATTERNS(self):
        return _BOT_MOTION_PATTERNS

    # _PARAMETRIC_ACTIONS holds the regex/template tuples for motion
    # confirmations that carry a number ('Turning 360 degrees.',
    # 'Walking 5 steps.'). Built from instruction.json::parametric_actions.
    @property
    def _PARAMETRIC_ACTIONS(self):
        return _PARAMETRIC_ACTIONS

    def _dispatch_gemini_bot(self, text: str) -> None:
        """
        Dispatch motion when Gemini's spoken reply contains motion-
        confirmation patterns (English or Arabic). The wake-word gate
        lives INSIDE Gemini (see config_Voice.json::gemini_system_prompt) —
        if Gemini speaks a motion phrase, that is its own decision after
        evaluating whether the user said "Sanad"/"سند". Marcus trusts it
        and dispatches the motion(s).

        COMPOUND-COMMAND HANDLING: a single chunk may contain multiple
        motion phrases — e.g. "Turning right, then moving forward." We
        scan for ALL non-overlapping motion patterns in the chunk and
        fire each unique canonical in spoken order. Three guards:
          1. Longest-needle-first sort + span-claiming on the pattern
             list rejects overlaps so 'moving forward' wins over the
             shorter prefix 'moving' inside the same span.
          2. _fired_canons_this_turn ensures each canonical fires AT
             MOST ONCE per Gemini turn (compound replies often repeat
             the same verb across chunks); cleared on turn_end.
          3. command_cooldown_sec is a final safety net for runaway
             same-canon dispatch across a turn_end gap.
        """
        if not text:
            return

        # .lower() preserves character-by-character indexing for both
        # ASCII and Arabic, so positions in `low` correspond to positions
        # in `text` for ordering. Arabic patterns are stored as-is and
        # unaffected by lowercasing.
        low = text.lower()
        if not low.strip():
            return

        # Walk every pattern, collect all (start_index, command_string,
        # label) matches that don't overlap an already-claimed span.
        # Two pattern families:
        #   - PARAMETRIC (regex with capture groups, e.g. 'turning 90
        #     degrees') — scanned FIRST because they're more specific
        #     than the bare-canonical match 'turn right'. The dispatched
        #     string is the formatted command_template ('turn left 90
        #     degrees'), which Brain/command_parser.py parses natively.
        #   - FIXED CANONICALS (substring needles from bot_phrases)
        #     — scanned second; pre-sorted longest-first so multi-word
        #     phrases claim spans before shorter prefixes.
        # Per-turn dedup (_fired_canons_this_turn) is keyed by the
        # dispatched command string, so the same parametric command
        # ('turn 90 degrees') only fires once per Gemini reply.
        matches = []        # type: list[tuple[int, str, str]]
        claimed_spans = []  # type: list[tuple[int, int]]

        for compiled, template in self._PARAMETRIC_ACTIONS:
            for m in compiled.finditer(low):
                j, end = m.start(), m.end()
                if any(not (end <= s or j >= e)
                       for (s, e) in claimed_spans):
                    continue
                cmd_text = _format_param_template(template, m.groups())
                if not cmd_text:
                    continue
                matches.append((j, cmd_text, "param"))
                claimed_spans.append((j, end))

        for needle, cmd in self._BOT_MOTION_PATTERNS:
            if not needle:
                continue
            i = 0
            while True:
                j = low.find(needle, i)
                if j == -1:
                    break
                end = j + len(needle)
                if any(not (end <= s or j >= e)
                       for (s, e) in claimed_spans):
                    i = j + 1
                    continue
                matches.append((j, cmd, "canon"))
                claimed_spans.append((j, end))
                i = end

        if not matches:
            return

        # Spoken order — sort by position so we fire turn_right BEFORE
        # move_forward when Gemini said "Turning right, then moving
        # forward."
        matches.sort(key=lambda m: m[0])

        now = time.time()
        cooldown = float(self._stt.get("command_cooldown_sec", 1.5))
        for _, cmd, label in matches:
            if cmd in self._fired_canons_this_turn:
                continue
            if (cmd == self._last_gemini_canon
                    and now - self._last_gemini_dispatch_at < cooldown):
                continue
            self._fired_canons_this_turn.add(cmd)
            self._last_gemini_canon = cmd
            self._last_gemini_dispatch_at = now

            log.info(
                "dispatch (gemini-bot, %s): %s (heard: %r)",
                label, cmd, text[:80],
            )
            _log_transcript("CMD-BOT", cmd)
            if self._on_command:
                try:
                    self._on_command(cmd, "en")
                except Exception as e:
                    log.error("on_command error: %s", e, exc_info=True)

    # ─── start / stop ─────────────────────────────────────

    def start(self):
        """Start the voice loop thread. No-op if already running."""
        if self._running:
            log.warning("VoiceModule already running")
            return
        self._running = True
        self._thread = threading.Thread(
            target=self._voice_loop, daemon=True, name="voice",
        )
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Signal the voice loop to exit and join it (5 s timeout)."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def is_running(self) -> bool:
        """True while the voice loop thread is alive."""
        t = self._thread
        return bool(self._running and t is not None and t.is_alive())

    @property
    def is_speaking(self) -> bool:
        """Delegates to AudioAPI — True while TtsMaker is playing."""
        try:
            return bool(self._audio.is_speaking)
        except Exception:
            return False