416 lines
15 KiB
Python
416 lines
15 KiB
Python
#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Gemini Live voice orchestrator for Marcus.

Pipeline:

    G1 mic UDP ──► BuiltinMic (Voice/audio_io.py)
                        │
                        ▼
               GeminiBrain (Voice/gemini_script.py)
                        │ audio out (24 kHz)
                        ▼
    BuiltinSpeaker (Voice/audio_io.py) ──► G1 speaker
                        │ user transcript (on_command)
                        ▼
               _dispatch_gemini_command
                 - require wake word "Sanad"
                 - fuzzy-match command_vocab
                 - dedup within command_cooldown_sec
                        │
                        ▼
               on_command(text, "en") ──► Marcus brain

Gemini owns both STT and TTS — it hears the user and replies with its own
voice. Marcus's on_command hook fires alongside Gemini's verbal reply so
motion commands ("Sanad, turn right") still move the robot body while
the conversation flows naturally.

Wake word is enforced at dispatch only — Gemini chats normally on all
speech; the robot moves only when "Sanad" + a recognised action phrase
appears in the transcript.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
import os
|
||
import re
|
||
import sys
|
||
import threading
|
||
import time
|
||
from difflib import SequenceMatcher
|
||
from logging.handlers import RotatingFileHandler
|
||
from typing import Callable, Optional
|
||
|
||
import numpy as np
|
||
|
||
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||
if _PROJECT_DIR not in sys.path:
|
||
sys.path.insert(0, _PROJECT_DIR)
|
||
from Core.env_loader import PROJECT_ROOT
|
||
from Core.config_loader import load_config
|
||
|
||
# Central log directory under the project root — created eagerly so the
# file handlers below can open their targets.
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

# Root logging goes to logs/voice.log only (no console handler):
# 5 MB per file, 3 rotated backups.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        RotatingFileHandler(
            os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000, backupCount=3, encoding="utf-8",
        ),
    ],
)
# Module logger for this orchestrator.
log = logging.getLogger("marcus_voice")
|
||
|
||
|
||
# ── Transcript log ─────────────────────────────────────────────
# Every user transcript Gemini emits is written here in a simple
# one-line-per-entry format. Rotates every 5 MB × 3 backups.
_TRANSCRIPT_PATH = os.path.join(LOG_DIR, "transcript.log")
_transcript_log = logging.getLogger("transcript")
_transcript_log.setLevel(logging.INFO)
# Don't propagate to the root logger — keeps transcript lines out of voice.log.
_transcript_log.propagate = False
# Guard against duplicate handlers if this module is imported twice.
if not _transcript_log.handlers:
    _th = RotatingFileHandler(
        _TRANSCRIPT_PATH, maxBytes=5_000_000, backupCount=3, encoding="utf-8",
    )
    _th.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
    _transcript_log.addHandler(_th)
|
||
|
||
|
||
def _log_transcript(action: str, text: str) -> None:
    """Append one `ACTION text` line to logs/transcript.log (None-safe)."""
    payload = (text or "").strip()
    _transcript_log.info("%-5s %s", action, payload)
|
||
|
||
|
||
# Module-level vocabulary — populated from Config/config_Voice.json::stt.
# Used by the wake-word gate and the fuzzy-match command normalizer.
# All four are (re)assigned by VoiceModule.__init__ via `global`; the
# defaults below apply only before a VoiceModule has been constructed.
WAKE_WORDS: set = set()          # lowercase wake-word variants (e.g. "sanad")
COMMAND_VOCAB: list = []         # canonical command phrases (case preserved from config)
GARBAGE_PATTERNS: set = set()    # lowercase transcripts to drop outright
_MIN_TRANSCRIPTION_LENGTH: int = 3   # shortest stripped payload (chars) worth dispatching
|
||
|
||
|
||
def _has_wake_word(text: str) -> bool:
    """Return True when any wake-word variant occurs in `text` as a whole word."""
    lowered = text.lower()
    # WAKE_WORDS entries are stored lowercase, so a lowercased search suffices.
    return any(
        re.search(r'\b' + re.escape(word) + r'\b', lowered)
        for word in WAKE_WORDS
    )
|
||
|
||
|
||
def _strip_wake_word_once(text: str) -> str:
    """One stripping pass: collapse a bare-wake utterance to "" or drop a
    leading/trailing wake word. Callers should use _strip_wake_word()."""
    trimmed = text.strip()
    # Bare wake word (optionally followed by punctuation) carries no payload.
    if any(
        re.fullmatch(rf'{re.escape(word)}[\s,.!?]*', trimmed, re.IGNORECASE)
        for word in WAKE_WORDS
    ):
        return ""
    # Longest variants first so a multi-word variant wins over its prefix.
    for word in sorted(WAKE_WORDS, key=len, reverse=True):
        escaped = re.escape(word)
        prefix = re.match(
            rf'^\s*{escaped}\s*[,.!?]?\s+(.+)$',
            text, re.IGNORECASE,
        )
        if prefix:
            return prefix.group(1).strip(' ,.!?')
        suffix = re.match(
            rf'^(.+?)\s+{escaped}\s*[.!?]*\s*$',
            text, re.IGNORECASE,
        )
        if suffix:
            return suffix.group(1).strip(' ,.!?')
    return text
|
||
|
||
|
||
def _strip_wake_word(text: str) -> str:
    """
    Iteratively remove the wake word from either end of `text`, so
    repeated-wake transcriptions ("Sanad. Sanad.") fully collapse.
    At most 5 passes run, guarding against pathological inputs that
    never stabilise.
    """
    for _ in range(5):
        reduced = _strip_wake_word_once(text)
        if reduced == text:
            break
        text = reduced
    return text
|
||
|
||
|
||
def _closest_command(text: str, cutoff: float = 0.72) -> str:
    """
    Map a transcription to the closest known command phrase.

    Matching is case-insensitive on both sides: config vocab entries may
    be capitalized (_dispatch_gemini_command already lowercases them for
    its exact-match set, and this helper now agrees). On a hit the vocab
    entry is returned verbatim so downstream sees the canonical spelling.

    Args:
        text: raw transcription (wake word already stripped).
        cutoff: minimum SequenceMatcher ratio for a fuzzy hit.

    Returns:
        The canonical command on a substring or close-enough fuzzy match,
        else the original text unchanged.
    """
    low = text.lower().strip().rstrip(".!?,")
    if not low:
        return text

    # Fast path: a vocab phrase embedded verbatim in the transcript
    # (maps "turn right up" → "turn right").
    for cmd in COMMAND_VOCAB:
        if cmd.lower() in low:
            return cmd

    # Fuzzy path: best SequenceMatcher ratio across the vocab.
    best_cmd = None
    best_ratio = 0.0
    for cmd in COMMAND_VOCAB:
        r = SequenceMatcher(None, low, cmd.lower()).ratio()
        if r > best_ratio:
            best_ratio = r
            best_cmd = cmd

    # best_cmd stays None when COMMAND_VOCAB is empty — never return None.
    if best_cmd is not None and best_ratio >= cutoff:
        return best_cmd
    return text
|
||
|
||
|
||
class VoiceModule:
    """Thin orchestrator around GeminiBrain + command dispatch.

    Owns a single daemon thread (_voice_loop) that spawns the Gemini
    runner and forwards its transcripts into the wake-word/dedup gate.
    Vocabulary is loaded into module-level globals in __init__.
    """

    def __init__(
        self,
        audio_api,
        on_command: Optional[Callable] = None,
        on_wake: Optional[Callable] = None,
    ):
        # audio_api: object exposing `is_speaking` (see property below) —
        #   presumably the AudioAPI wrapper around TtsMaker; TODO confirm.
        # on_command: callback(text, lang) fired for wake-word commands.
        # on_wake: stored but never invoked in this module (reserved).
        self._audio = audio_api
        self._on_command = on_command
        self._on_wake = on_wake

        # Config/config_Voice.json — "stt" carries vocab + Gemini settings,
        # "messages" carries canned strings (unused in this file).
        self._config = load_config("Voice")
        self._stt = self._config.get("stt", {})
        self._messages = self._config.get("messages", {})

        # Load vocab from config — single source of truth.
        global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, _MIN_TRANSCRIPTION_LENGTH
        WAKE_WORDS = {w.lower() for w in self._stt.get("wake_words", [])}
        COMMAND_VOCAB = list(self._stt.get("command_vocab", []))
        GARBAGE_PATTERNS = {p.lower() for p in self._stt.get("garbage_patterns", [])}
        _MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3))
        self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72))
        log.info(
            "vocab loaded: %d wake_words, %d command_vocab, %d garbage_patterns",
            len(WAKE_WORDS), len(COMMAND_VOCAB), len(GARBAGE_PATTERNS),
        )

        # Dispatch dedup state: Gemini's input_transcription can fire
        # multiple times per turn (streaming partials). Track the last
        # canonical command + timestamp so we don't move twice.
        self._last_gemini_canon = ""
        self._last_gemini_dispatch_at = 0.0

        # Gemini brain reference for flush_mic() — populated by
        # _voice_loop after spawning the runner subprocess.
        self._brain = None

        # Thread lifecycle flags — owned by start()/stop().
        self._running = False
        self._thread = None

        log.info("VoiceModule initialized (backend=gemini)")

    # ─── main loop ────────────────────────────────────────

    def _voice_loop(self):
        """
        Spawn the Gemini Live STT subprocess (runs in the gemini_sdk
        Python 3.10+ env) and forward its transcripts into Marcus's
        dispatch gate. Marcus's main process never opens the Gemini
        WebSocket itself — google-genai needs Python ≥3.9 and marcus
        is pinned to 3.8 by the Jetson torch wheel.
        """
        # Key resolution order: new env name → legacy env name → config.
        api_key = (
            os.environ.get("MARCUS_GEMINI_API_KEY")
            or os.environ.get("SANAD_GEMINI_API_KEY")
            or self._stt.get("gemini_api_key", "")
        )
        if not api_key:
            log.error(
                "No Gemini API key found. Set env MARCUS_GEMINI_API_KEY "
                "or stt.gemini_api_key in Config/config_Voice.json"
            )
            # Park the thread until stop() clears _running — keeps the
            # start/stop lifecycle symmetric even when misconfigured.
            while self._running:
                time.sleep(0.5)
            return

        from Voice.gemini_script import GeminiBrain

        # Env overrides for model + voice are passed through to the
        # runner subprocess automatically (it reads the same env vars).
        # NOTE: `model` is only logged here — the runner resolves it itself.
        model = (
            os.environ.get("MARCUS_GEMINI_MODEL")
            or self._stt.get(
                "gemini_model",
                "gemini-2.5-flash-native-audio-preview-12-2025",
            )
        )
        voice_name = (
            os.environ.get("MARCUS_GEMINI_VOICE")
            or self._stt.get("gemini_voice_name", "Charon")
        )
        # System prompt: the runner reads the same config & file paths,
        # but we forward the resolved string in case marcus's config layer
        # picked a fallback. Forwarded via env in GeminiBrain.start().
        system_prompt = self._stt.get(
            "gemini_system_prompt",
            "Transcribe what the user says to Sanad. Stay silent.",
        )
        sp_file = self._stt.get("gemini_system_prompt_file", "")
        if sp_file:
            # Relative prompt-file paths resolve against the project root.
            sp_path = sp_file if os.path.isabs(sp_file) else os.path.join(
                PROJECT_ROOT, sp_file,
            )
            try:
                with open(sp_path, "r", encoding="utf-8") as f:
                    loaded = f.read().strip()
                # An empty file keeps the inline/config prompt.
                if loaded:
                    system_prompt = loaded
                    log.info(
                        "gemini system prompt loaded from %s (%d chars)",
                        sp_path, len(loaded),
                    )
            except Exception as e:
                # Best-effort: unreadable file falls back to the inline prompt.
                log.warning(
                    "gemini_system_prompt_file=%r unreadable: %s — "
                    "using inline config", sp_file, e,
                )

        log.info(
            "Voice loop started — GEMINI STT subprocess "
            "(model=%s, voice=%s)", model, voice_name,
        )

        brain = GeminiBrain(
            None, None,  # audio_io, recorder owned by runner
            voice_name=voice_name,
            system_prompt=system_prompt,
            api_key=api_key,
            on_transcript=self._on_gemini_transcript,
            on_command=self._dispatch_gemini_command,
        )
        self._brain = brain
        brain.start()

        # Heartbeat until stop(); the runner subprocess does the real work.
        # The finally block guarantees the runner is torn down on any exit.
        try:
            while self._running:
                time.sleep(0.25)
        finally:
            brain.stop()
            self._brain = None

    # ─── dispatch side channel ────────────────────────────

    def _on_gemini_transcript(self, text: str) -> None:
        """Log every user transcript to logs/transcript.log."""
        if text:
            _log_transcript("HEARD", text)

    def _dispatch_gemini_command(self, text: str, lang: str = "en") -> None:
        """
        Fire self._on_command for any transcript prefixed with the wake
        word "Sanad". Marcus's brain is the authoritative decision maker
        in the STT-only architecture — it handles motion AND Q&A AND
        vision queries AND replies via TtsMaker.

        The vocab-match gate has been dropped: if the user says
        "Sanad, what's the weather" the transcript still reaches the
        brain, which either answers via its VLM or declines. This keeps
        all Gemini-heard queries routed through one place (Marcus) and
        removes the audio collision that full S2S had.

        Examples:
            "Sanad, turn right" → strip → "turn right" → brain → motion
            "Sanad, what do you see"→ strip → "what do you see" → brain → VLM
            "Sanad" → bare wake → skip (no payload)
            "turn right" → no wake word → skip (conversation gate)

        Dedup: Gemini emits streaming partials; same normalized command
        within command_cooldown_sec fires only once.
        """
        # Conversation gate: no wake word → Gemini chats, robot stays still.
        if not text or not _has_wake_word(text):
            return

        # Bare wake / too-short payloads carry no actionable command.
        stripped = _strip_wake_word(text)
        if not stripped or len(stripped.strip()) < _MIN_TRANSCRIPTION_LENGTH:
            return

        # Drop known-garbage transcripts unless they are literal vocab hits.
        low = stripped.lower().strip().rstrip(".!?,")
        vocab_exact = {c.lower() for c in COMMAND_VOCAB}
        if low in GARBAGE_PATTERNS and low not in vocab_exact:
            return

        # Fuzzy-normalize (maps "turn right up" → "turn right") if the
        # transcript is close to a vocab entry — but unlike before, we
        # forward everything that passed the wake-word gate, not just
        # vocab hits. Marcus's command_parser + VLM handles the rest.
        command = self._normalize_command(stripped)
        canon = command.lower().strip().rstrip(".!?,")

        # Dedup streaming partials: same canonical command inside the
        # cooldown window fires only once.
        now = time.time()
        cooldown = float(self._stt.get("command_cooldown_sec", 1.5))
        if (canon == self._last_gemini_canon
                and now - self._last_gemini_dispatch_at < cooldown):
            return
        self._last_gemini_canon = canon
        self._last_gemini_dispatch_at = now

        log.info("dispatch (gemini): %s", command[:120])
        _log_transcript("CMD", command)
        if self._on_command:
            try:
                self._on_command(command, lang or "en")
            except Exception as e:
                # Never let a brain-side failure kill the voice thread.
                log.error("on_command error: %s", e, exc_info=True)

    def flush_mic(self) -> None:
        """
        Tell the Gemini runner subprocess to drop its mic buffer. Called
        before AND after `audio_api.speak()` so the robot's own voice
        (picked up by the mic during TtsMaker playback) doesn't come back
        from Gemini as a fake user utterance.
        No-op if the runner hasn't started yet.
        """
        b = getattr(self, "_brain", None)
        if b is None:
            return
        try:
            b.flush_mic()
        except Exception:
            # Best-effort: a dying/starting runner is not an error here.
            pass

    def _normalize_command(self, text: str) -> str:
        """Fuzzy-match a transcription to the closest canonical phrase."""
        canonical = _closest_command(text, cutoff=self._vocab_cutoff)
        if canonical != text:
            log.info("fuzzy-match: %r → %r", text, canonical)
        return canonical

    # ─── start / stop ─────────────────────────────────────

    def start(self):
        """Spawn the daemon voice thread; no-op if already running."""
        if self._running:
            log.warning("VoiceModule already running")
            return
        self._running = True
        self._thread = threading.Thread(
            target=self._voice_loop, daemon=True, name="voice",
        )
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Signal the loop to exit and join the thread (5 s timeout)."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def is_speaking(self) -> bool:
        """Delegates to AudioAPI — True while TtsMaker is playing."""
        try:
            return bool(self._audio.is_speaking)
        except Exception:
            # A missing/broken audio backend reads as "not speaking".
            return False
|