Marcus/Voice/marcus_voice.py

416 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Voice/marcus_voice.py — Gemini Live voice orchestrator for Marcus.
Pipeline:
G1 mic UDP ──► BuiltinMic (Voice/audio_io.py)
GeminiBrain (Voice/gemini_script.py)
│ audio out (24 kHz)
BuiltinSpeaker (Voice/audio_io.py) ──► G1 speaker
│ user transcript (on_command)
_dispatch_gemini_command
- require wake word "Sanad"
- fuzzy-match command_vocab
- dedup within command_cooldown_sec
on_command(text, "en") ──► Marcus brain
Gemini owns both STT and TTS — it hears the user and replies with its own
voice. Marcus's on_command hook fires alongside Gemini's verbal reply so
motion commands ("Sanad, turn right") still move the robot body while
the conversation flows naturally.
Wake word is enforced at dispatch only — Gemini chats normally on all
speech; the robot moves only when "Sanad" + a recognised action phrase
appears in the transcript.
"""
from __future__ import annotations
import logging
import os
import re
import sys
import threading
import time
from difflib import SequenceMatcher
from logging.handlers import RotatingFileHandler
from typing import Callable, Optional
import numpy as np
_PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if _PROJECT_DIR not in sys.path:
sys.path.insert(0, _PROJECT_DIR)
from Core.env_loader import PROJECT_ROOT
from Core.config_loader import load_config
# ── Process-wide logging ───────────────────────────────────────
# Root-logger records go to logs/voice.log (rotated at 5 MB, 3 backups
# kept). Only a file handler is requested here — no console handler.
# Note that logging.basicConfig() is a no-op when the root logger has
# already been configured by an earlier import.
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
    handlers=[
        RotatingFileHandler(
            filename=os.path.join(LOG_DIR, "voice.log"),
            maxBytes=5_000_000,
            backupCount=3,
            encoding="utf-8",
        ),
    ],
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    level=logging.INFO,
)
log = logging.getLogger("marcus_voice")
# ── Transcript log ─────────────────────────────────────────────
# Dedicated "transcript" logger: one line per entry, written to
# logs/transcript.log and rotated at 5 MB with 3 backups. Propagation
# is switched off so transcript lines never reach the root handlers.
_TRANSCRIPT_PATH = os.path.join(LOG_DIR, "transcript.log")
_transcript_log = logging.getLogger("transcript")
_transcript_log.propagate = False
_transcript_log.setLevel(logging.INFO)
if not _transcript_log.handlers:
    # Only attach a handler the first time, so the logger never ends up
    # with two handlers writing every line twice.
    _th = RotatingFileHandler(
        filename=_TRANSCRIPT_PATH,
        maxBytes=5_000_000,
        backupCount=3,
        encoding="utf-8",
    )
    _th.setFormatter(logging.Formatter(fmt="%(asctime)s %(message)s"))
    _transcript_log.addHandler(_th)
def _log_transcript(action: str, text: str) -> None:
    """Write one entry to the transcript log.

    `action` is a short tag, left-aligned and padded to five columns.
    `text` is trimmed of surrounding whitespace; None or empty text is
    logged as an empty payload.
    """
    payload = (text or "").strip()
    _transcript_log.info("%-5s %s", action, payload)
# Module-level vocabulary — populated from Config/config_Voice.json::stt.
# Used by the wake-word gate and the fuzzy-match command normalizer.
# All four names start out empty/default and are rebound (via `global`)
# by VoiceModule.__init__, so the module-level helpers only see real
# vocabulary once a VoiceModule has been constructed.
WAKE_WORDS: set = set()  # wake-word variants, stored lower-cased
COMMAND_VOCAB: list = []  # canonical command phrases, in config order
GARBAGE_PATTERNS: set = set()  # lower-cased transcripts to drop before dispatch
_MIN_TRANSCRIPTION_LENGTH: int = 3  # payloads shorter than this are not dispatched
def _has_wake_word(text: str) -> bool:
    """Report whether any entry of WAKE_WORDS occurs in `text` as a whole word.

    The text is lower-cased before matching, so WAKE_WORDS is expected to
    hold lower-case entries. Word-boundary anchors keep a wake word that
    is merely embedded in a longer word from counting.
    """
    haystack = text.lower()
    return any(
        re.search(r'\b' + re.escape(word) + r'\b', haystack) is not None
        for word in WAKE_WORDS
    )
def _strip_wake_word_once(text: str) -> str:
    """Single pass of wake-word stripping. Use via _strip_wake_word().

    Returns "" when `text` is nothing but a wake word plus trailing
    punctuation/whitespace. Otherwise removes one wake word from the very
    start or the very end of `text` (longer wake words are tried first)
    and returns the remainder trimmed of spaces and ",.!?". Text with no
    leading or trailing wake word is returned unchanged.
    """
    bare = text.strip()
    if any(
        re.fullmatch(rf'{re.escape(w)}[\s,.!?]*', bare, re.IGNORECASE)
        for w in WAKE_WORDS
    ):
        # Nothing but the wake word itself — there is no payload.
        return ""

    longest_first = sorted(WAKE_WORDS, key=len, reverse=True)
    for w in longest_first:
        # "<wake>[punct] <payload>"
        leading = re.match(
            rf'^\s*{re.escape(w)}\s*[,.!?]?\s+(.+)$',
            text, re.IGNORECASE,
        )
        if leading:
            return leading.group(1).strip(' ,.!?')
        # "<payload> <wake>[punct]"
        trailing = re.match(
            rf'^(.+?)\s+{re.escape(w)}\s*[.!?]*\s*$',
            text, re.IGNORECASE,
        )
        if trailing:
            return trailing.group(1).strip(' ,.!?')
    return text
def _strip_wake_word(text: str) -> str:
    """
    Strip the wake word from either end of `text`, repeating until the
    text stops changing so that doubled wake words ("Sanad. Sanad.")
    collapse completely.

    At most 5 passes are made, which bounds the work on pathological
    input; whatever remains after the last pass is returned.
    """
    passes_left = 5
    while passes_left > 0:
        reduced = _strip_wake_word_once(text)
        if reduced == text:
            # Fixed point reached — nothing more to strip.
            break
        text = reduced
        passes_left -= 1
    return text
def _closest_command(text: str, cutoff: float = 0.72) -> str:
    """
    Map a transcription to the closest known command phrase.

    Matching runs in two stages against COMMAND_VOCAB, case-insensitively:

    1. Containment — a vocab phrase that occurs in the transcription as
       whole words wins outright ("turn right up" -> "turn right"). A
       phrase that is only a fragment of a longer word does not count
       ("go" is not found in "good morning"). When several phrases are
       contained, the longest is chosen; the earliest vocab entry breaks
       ties.
    2. Fuzzy — otherwise the phrase with the highest SequenceMatcher ratio
       is used, provided that ratio reaches `cutoff`.

    Args:
        text: raw transcription (wake word already removed).
        cutoff: minimum similarity ratio (0..1) for a fuzzy match.

    Returns:
        The matching entry exactly as it appears in COMMAND_VOCAB, or the
        original `text` unchanged when nothing is close enough.
    """
    low = text.lower().strip().rstrip(".!?,")
    if not low:
        return text

    # Normalise each vocab entry once. Blank entries are skipped because
    # an empty phrase would otherwise be "contained" in every transcript.
    candidates = []
    for cmd in COMMAND_VOCAB:
        key = cmd.lower().strip()
        if key:
            candidates.append((key, cmd))

    # Stage 1: whole-word containment. Lookarounds are used instead of \b
    # so phrases that begin or end with punctuation still anchor properly.
    contained = [
        (key, cmd)
        for key, cmd in candidates
        if re.search(r'(?<!\w)' + re.escape(key) + r'(?!\w)', low)
    ]
    if contained:
        # max() keeps the first of equally long phrases -> vocab order.
        return max(contained, key=lambda pair: len(pair[0]))[1]

    # Stage 2: best fuzzy ratio over the whole transcript.
    best_cmd = None
    best_ratio = 0.0
    for key, cmd in candidates:
        ratio = SequenceMatcher(None, low, key).ratio()
        if ratio > best_ratio:
            best_ratio = ratio
            best_cmd = cmd
    # Guard on best_cmd so an empty vocab (or cutoff <= 0) can never
    # return None to callers that expect a string.
    if best_cmd is not None and best_ratio >= cutoff:
        return best_cmd
    return text
class VoiceModule:
    """Thin orchestrator around GeminiBrain + command dispatch.

    Runs a background "voice" thread that creates a GeminiBrain, keeps it
    alive while the module is running, and forwards wake-word-gated user
    transcripts to the `on_command` callback.
    """

    def __init__(
        self,
        audio_api,
        on_command: Optional[Callable] = None,
        on_wake: Optional[Callable] = None,
    ):
        """
        Load the "Voice" config and publish its vocabulary to the module.

        Args:
            audio_api: audio facade; this class only reads its
                `is_speaking` attribute.
            on_command: called as `on_command(text, lang)` for every
                dispatched command. May be None (commands are then only
                logged).
            on_wake: stored on the instance; not invoked by this class.
        """
        global WAKE_WORDS, COMMAND_VOCAB, GARBAGE_PATTERNS, _MIN_TRANSCRIPTION_LENGTH

        self._audio = audio_api
        self._on_command = on_command
        self._on_wake = on_wake
        self._config = load_config("Voice")
        self._stt = self._config.get("stt", {})
        self._messages = self._config.get("messages", {})

        # Load vocab from config — single source of truth.
        # Blank wake-word entries are dropped: an empty pattern between
        # word boundaries would match any speech and disable the gate.
        WAKE_WORDS = {
            w.strip().lower()
            for w in self._stt.get("wake_words", [])
            if w and w.strip()
        }
        COMMAND_VOCAB = list(self._stt.get("command_vocab", []))
        GARBAGE_PATTERNS = {p.lower() for p in self._stt.get("garbage_patterns", [])}
        _MIN_TRANSCRIPTION_LENGTH = int(self._stt.get("min_transcription_length", 3))
        self._vocab_cutoff = float(self._stt.get("command_vocab_cutoff", 0.72))
        log.info(
            "vocab loaded: %d wake_words, %d command_vocab, %d garbage_patterns",
            len(WAKE_WORDS), len(COMMAND_VOCAB), len(GARBAGE_PATTERNS),
        )

        # Dispatch dedup state: Gemini's input_transcription can fire
        # multiple times per turn (streaming partials). Track the last
        # canonical command + timestamp so we don't move twice.
        # The timestamp is on the time.monotonic() clock; -inf means
        # "nothing dispatched yet".
        self._last_gemini_canon = ""
        self._last_gemini_dispatch_at = float("-inf")

        # Gemini brain reference for flush_mic() — set by _voice_loop
        # while the brain is alive, None otherwise.
        self._brain = None
        self._running = False
        self._thread = None
        log.info("VoiceModule initialized (backend=gemini)")

    # ─── main loop ────────────────────────────────────────
    def _voice_loop(self):
        """
        Spawn the Gemini Live STT subprocess (runs in the gemini_sdk
        Python 3.10+ env) and forward its transcripts into Marcus's
        dispatch gate. Marcus's main process never opens the Gemini
        WebSocket itself — google-genai needs Python ≥3.9 and marcus
        is pinned to 3.8 by the Jetson torch wheel.

        Thread target for start(); returns once `self._running` is False.
        """
        api_key = (
            os.environ.get("MARCUS_GEMINI_API_KEY")
            or os.environ.get("SANAD_GEMINI_API_KEY")
            or self._stt.get("gemini_api_key", "")
        )
        if not api_key:
            log.error(
                "No Gemini API key found. Set env MARCUS_GEMINI_API_KEY "
                "or stt.gemini_api_key in Config/config_Voice.json"
            )
            # Nothing to run without a key: idle until stop() is called.
            while self._running:
                time.sleep(0.5)
            return

        # Imported lazily so the module can be loaded without the brain.
        from Voice.gemini_script import GeminiBrain

        # Env overrides for model + voice are passed through to the
        # runner subprocess automatically (it reads the same env vars).
        # NOTE(review): `model` is only logged below — it is not handed
        # to GeminiBrain, so a config-only "gemini_model" value reaches
        # the runner only if the runner reads the same config. Confirm.
        model = (
            os.environ.get("MARCUS_GEMINI_MODEL")
            or self._stt.get(
                "gemini_model",
                "gemini-2.5-flash-native-audio-preview-12-2025",
            )
        )
        voice_name = (
            os.environ.get("MARCUS_GEMINI_VOICE")
            or self._stt.get("gemini_voice_name", "Charon")
        )
        system_prompt = self._load_system_prompt()
        log.info(
            "Voice loop started — GEMINI STT subprocess "
            "(model=%s, voice=%s)", model, voice_name,
        )
        brain = GeminiBrain(
            None, None,  # audio_io, recorder owned by runner
            voice_name=voice_name,
            system_prompt=system_prompt,
            api_key=api_key,
            on_transcript=self._on_gemini_transcript,
            on_command=self._dispatch_gemini_command,
        )
        self._brain = brain
        try:
            # start() sits inside the try so a failed start still gets a
            # stop() and never leaves a dead brain behind in self._brain.
            brain.start()
            while self._running:
                time.sleep(0.25)
        finally:
            try:
                brain.stop()
            finally:
                self._brain = None

    def _load_system_prompt(self) -> str:
        """
        Resolve the system prompt forwarded to GeminiBrain.

        The runner reads the same config & file paths, but we forward the
        resolved string in case marcus's config layer picked a fallback.

        Returns the contents of `stt.gemini_system_prompt_file` when that
        file is configured, readable and non-empty; otherwise the inline
        `stt.gemini_system_prompt` value (or its built-in default).
        Relative file paths are resolved against PROJECT_ROOT.
        """
        system_prompt = self._stt.get(
            "gemini_system_prompt",
            "Transcribe what the user says to Sanad. Stay silent.",
        )
        sp_file = self._stt.get("gemini_system_prompt_file", "")
        if not sp_file:
            return system_prompt

        sp_path = sp_file if os.path.isabs(sp_file) else os.path.join(
            PROJECT_ROOT, sp_file,
        )
        try:
            with open(sp_path, "r", encoding="utf-8") as f:
                loaded = f.read().strip()
        except (OSError, ValueError) as e:
            # ValueError covers undecodable bytes (UnicodeDecodeError).
            log.warning(
                "gemini_system_prompt_file=%r unreadable: %s; "
                "using inline config", sp_file, e,
            )
            return system_prompt

        if not loaded:
            # An empty file is treated the same as no file.
            return system_prompt
        log.info(
            "gemini system prompt loaded from %s (%d chars)",
            sp_path, len(loaded),
        )
        return loaded

    # ─── dispatch side channel ────────────────────────────
    def _on_gemini_transcript(self, text: str) -> None:
        """Log every user transcript to logs/transcript.log."""
        if text:
            _log_transcript("HEARD", text)

    def _dispatch_gemini_command(self, text: str, lang: str = "en") -> None:
        """
        Fire self._on_command for any transcript prefixed with the wake
        word "Sanad". Marcus's brain is the authoritative decision maker
        in the STT-only architecture — it handles motion AND Q&A AND
        vision queries AND replies via TtsMaker.

        The vocab-match gate has been dropped: if the user says
        "Sanad, what's the weather" the transcript still reaches the
        brain, which either answers via its VLM or declines. This keeps
        all Gemini-heard queries routed through one place (Marcus) and
        removes the audio collision that full S2S had.

        Examples:
            "Sanad, turn right"      → strip → "turn right" → brain → motion
            "Sanad, what do you see" → strip → "what do you see" → brain → VLM
            "Sanad"                  → bare wake → skip (no payload)
            "turn right"             → no wake word → skip (conversation gate)

        Dedup: Gemini emits streaming partials; same normalized command
        within command_cooldown_sec fires only once.

        Args:
            text: user transcript as heard by Gemini.
            lang: language tag passed through to on_command; a falsy
                value is replaced by "en".
        """
        if not text or not _has_wake_word(text):
            return
        stripped = _strip_wake_word(text)
        if not stripped or len(stripped.strip()) < _MIN_TRANSCRIPTION_LENGTH:
            return

        # Drop known-garbage transcripts, unless the exact same phrase is
        # also a real command.
        low = stripped.lower().strip().rstrip(".!?,")
        if low in GARBAGE_PATTERNS and low not in {c.lower() for c in COMMAND_VOCAB}:
            return

        # Fuzzy-normalize (maps "turn right up" → "turn right") if the
        # transcript is close to a vocab entry — but unlike before, we
        # forward everything that passed the wake-word gate, not just
        # vocab hits. Marcus's command_parser + VLM handles the rest.
        command = self._normalize_command(stripped)
        canon = command.lower().strip().rstrip(".!?,")

        # Monotonic clock: the cooldown measures elapsed time, so it must
        # not be affected by wall-clock adjustments.
        now = time.monotonic()
        cooldown = float(self._stt.get("command_cooldown_sec", 1.5))
        if (canon == self._last_gemini_canon
                and now - self._last_gemini_dispatch_at < cooldown):
            return
        self._last_gemini_canon = canon
        self._last_gemini_dispatch_at = now

        log.info("dispatch (gemini): %s", command[:120])
        _log_transcript("CMD", command)
        if self._on_command:
            try:
                self._on_command(command, lang or "en")
            except Exception as e:
                # A failing callback must not take the transcript
                # pipeline down with it.
                log.error("on_command error: %s", e, exc_info=True)

    def flush_mic(self) -> None:
        """
        Tell the Gemini runner subprocess to drop its mic buffer. Called
        before AND after `audio_api.speak()` so the robot's own voice
        (picked up by the mic during TtsMaker playback) doesn't come back
        from Gemini as a fake user utterance.

        No-op if the runner hasn't started yet. Best-effort: a failure is
        logged and never raised to the caller.
        """
        # Copy to a local so a concurrent reset of self._brain cannot
        # turn the call below into an attribute access on None.
        b = getattr(self, "_brain", None)
        if b is None:
            return
        try:
            b.flush_mic()
        except Exception as e:
            log.warning("flush_mic failed: %s", e)

    def _normalize_command(self, text: str) -> str:
        """Fuzzy-match a transcription to the closest canonical phrase.

        Returns `text` unchanged when no vocab entry is close enough
        (threshold: stt.command_vocab_cutoff).
        """
        canonical = _closest_command(text, cutoff=self._vocab_cutoff)
        if canonical != text:
            log.info("fuzzy-match: %r -> %r", text, canonical)
        return canonical

    # ─── start / stop ─────────────────────────────────────
    def start(self):
        """Start the background voice thread; no-op if already running."""
        if self._running:
            log.warning("VoiceModule already running")
            return
        self._running = True
        self._thread = threading.Thread(
            target=self._voice_loop, daemon=True, name="voice",
        )
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Signal the voice thread to exit and wait up to 5 s for it."""
        self._running = False
        worker = self._thread
        if worker:
            worker.join(timeout=5)
            if worker.is_alive():
                # The thread is a daemon, so it cannot block interpreter
                # exit, but a later start() would run alongside it.
                log.warning("voice thread did not exit within 5 s")
            self._thread = None
        log.info("Voice module stopped")

    @property
    def is_speaking(self) -> bool:
        """Delegates to AudioAPI — True while TtsMaker is playing.

        Returns False if the audio API is missing or raises.
        """
        try:
            return bool(self._audio.is_speaking)
        except Exception:
            return False