#!/usr/bin/env python3
|
|
"""
|
|
Features/Voice/marcus_voice.py — Marcus Always-Listening Voice Module
|
|
======================================================================
|
|
State machine:
|
|
IDLE → (wake word detected) → WAKE_HEARD
|
|
WAKE_HEARD → (record command) → PROCESSING
|
|
PROCESSING → (Whisper transcribe) → send to brain → SPEAKING
|
|
SPEAKING → (TTS done) → IDLE
|
|
|
|
Wake word: "Marcus" / "ماركوس" (detected by Whisper tiny)
|
|
Commands: Transcribed by Whisper small
|
|
TTS: Handled by API/audio_api.py
|
|
|
|
Usage:
|
|
from Features.Voice.marcus_voice import VoiceModule
|
|
voice = VoiceModule(audio_api, on_command=brain.handle_voice_command)
|
|
voice.start() # background thread
|
|
voice.stop()
|
|
"""
|
|
|
|
import json
import logging
import os
import subprocess
import threading
import time

import numpy as np

# ─── PATH CONFIG ─────────────────────────────────────────
# .env may override PROJECT_BASE, so it must be loaded before the
# path constants below are computed.
from dotenv import load_dotenv

load_dotenv()

BASE_PATH = os.environ.get("PROJECT_BASE", "/home/unitree")
PROJECT_NAME = "Marcus"
PROJECT_ROOT = os.path.join(BASE_PATH, PROJECT_NAME)

# ─── LOGGING ─────────────────────────────────────────────
LOG_DIR = os.path.join(PROJECT_ROOT, "logs")
os.makedirs(LOG_DIR, exist_ok=True)

# Log to both logs/voice.log and the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(os.path.join(LOG_DIR, "voice.log")),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger("marcus_voice")
|
|
|
|
|
|
def load_config(name: str) -> dict:
    """Load Config/config_<name>.json from the project root.

    Args:
        name: Config suffix, e.g. "Voice" loads config_Voice.json.

    Returns:
        The parsed JSON document as a dict.

    Raises:
        FileNotFoundError: if the config file does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    path = os.path.join(PROJECT_ROOT, "Config", f"config_{name}.json")
    # Explicit UTF-8: the voice config contains Arabic wake words, which
    # would fail to decode under a non-UTF-8 locale default encoding.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
|
# ─── STATE ENUM ──────────────────────────────────────────
|
|
|
|
class State:
    """Voice-loop state machine states.

    Plain string constants (not an Enum) so they log and compare as
    readable strings. Transitions: IDLE -> WAKE_HEARD -> PROCESSING,
    and SPEAKING -> IDLE (see VoiceModule._voice_loop).
    """

    IDLE = "IDLE"              # listening for the wake word in short chunks
    WAKE_HEARD = "WAKE_HEARD"  # wake word detected; record the command
    PROCESSING = "PROCESSING"  # transcribe the command and dispatch it
    SPEAKING = "SPEAKING"      # wait for TTS playback to finish
|
|
|
|
|
|
# ─── VOICE MODULE ────────────────────────────────────────
|
|
|
|
class VoiceModule:
    """Always-listening voice interface for Marcus.

    Runs a background daemon thread that cycles through the state machine
    IDLE -> WAKE_HEARD -> PROCESSING -> SPEAKING -> IDLE, capturing mic
    audio with `parec` (PulseAudio) and transcribing it with Whisper.
    """

    def __init__(self, audio_api, on_command=None):
        """
        Args:
            audio_api: AudioAPI instance (from API/audio_api.py)
            on_command: callback(text: str, lang: str) — called when command is transcribed
        """
        self._audio = audio_api
        self._on_command = on_command
        self._config = load_config("Voice")

        self._stt = self._config["stt"]
        self._mic = self._config["mic"]

        # Whisper models — lazy loaded on first use (loading is slow).
        self._wake_model = None
        self._cmd_model = None

        # Wake words: English matched case-insensitively, Arabic as-is
        # (Arabic script has no case).
        self._wake_en = [w.lower() for w in self._stt["wake_words_en"]]
        self._wake_ar = self._stt["wake_words_ar"]

        # State machine / thread bookkeeping.
        self._state = State.IDLE
        self._running = False
        self._thread = None
        self._lock = threading.Lock()  # NOTE(review): currently unused; kept for compatibility
        # Buffer handed from WAKE_HEARD to PROCESSING. Initialized here so
        # _do_processing can never hit an AttributeError.
        self._command_audio = None

        log.info("VoiceModule initialized")

    # ─── MODEL LOADING ────────────────────────────────────

    def _load_whisper(self):
        """Lazy-load the wake-word and command Whisper models (idempotent)."""
        import whisper

        if self._wake_model is None:
            log.info("Loading Whisper '%s' for wake word...", self._stt["wake_model"])
            self._wake_model = whisper.load_model(self._stt["wake_model"])
            log.info("Wake model ready")

        if self._cmd_model is None:
            log.info("Loading Whisper '%s' for commands...", self._stt["command_model"])
            self._cmd_model = whisper.load_model(self._stt["command_model"])
            log.info("Command model ready")

    # ─── MIC RECORDING ────────────────────────────────────

    def _record_chunk(self, seconds: float) -> np.ndarray:
        """Record `seconds` of mono s16le audio from the mic via parec.

        Returns:
            int16 sample array (may be empty if parec produced no data).
        """
        # str() so an integer source index from config is a valid argv item.
        source = str(self._mic["source_index"])
        rate = str(self._mic["rate"])

        proc = subprocess.Popen(
            ["parec", "-d", source,
             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )
        time.sleep(seconds)
        proc.terminate()
        # communicate() drains the pipe AND reaps the child. The previous
        # bare stdout.read() never wait()ed, leaking one zombie parec per
        # 2-second idle chunk on a long-running robot.
        raw, _ = proc.communicate()
        return np.frombuffer(raw, dtype=np.int16)

    def _record_until_silence(self) -> np.ndarray:
        """Record until silence is detected or max duration is reached.

        Silence = chunk RMS below `silence_threshold` for
        `silence_duration_sec`; requires at least ~1s of audio
        (chunk_count > 2) before silence may end the recording.

        Returns:
            int16 sample array (empty array if nothing was captured).
        """
        source = str(self._mic["source_index"])
        rate = self._mic["rate"]
        threshold = self._stt["silence_threshold"]
        silence_dur = self._stt["silence_duration_sec"]
        max_dur = self._stt["max_record_sec"]

        chunk_sec = 0.5
        chunk_samples = int(rate * chunk_sec)
        silence_chunks_needed = int(silence_dur / chunk_sec)
        max_chunks = int(max_dur / chunk_sec)

        proc = subprocess.Popen(
            ["parec", "-d", source,
             "--format=s16le", f"--rate={rate}", "--channels=1", "--raw"],
            stdout=subprocess.PIPE,
        )

        all_audio = []
        silence_count = 0
        chunk_count = 0

        try:
            while chunk_count < max_chunks:
                data = proc.stdout.read(chunk_samples * 2)  # s16le: 2 bytes/sample
                if not data:
                    break  # parec exited or the source vanished

                chunk = np.frombuffer(data, dtype=np.int16)
                all_audio.append(chunk)
                chunk_count += 1

                # RMS on a float64 copy — squaring int16 in place would overflow.
                rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
                if rms < threshold:
                    silence_count += 1
                else:
                    silence_count = 0

                if silence_count >= silence_chunks_needed and chunk_count > 2:
                    log.info("Silence detected after %.1fs", chunk_count * chunk_sec)
                    break
        finally:
            proc.terminate()
            proc.stdout.read()  # drain buffered audio so the pipe can close
            proc.wait()  # reap the child — avoids accumulating zombie parec processes

        if all_audio:
            return np.concatenate(all_audio)
        return np.array([], dtype=np.int16)

    # ─── TRANSCRIPTION ────────────────────────────────────

    def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
        """Transcribe int16 audio with the given Whisper model.

        Args:
            audio: int16 mono samples (assumed 16 kHz, Whisper's input
                rate — confirm mic config rate matches).
            model: a loaded whisper model (wake or command).
            task: Whisper task, "transcribe" or "translate".

        Returns:
            Stripped transcription text (may be empty).
        """
        # (removed an unused `import whisper` — the model is passed in.)

        # Whisper expects float32 samples in [-1, 1].
        audio_f32 = audio.astype(np.float32) / 32768.0

        result = model.transcribe(
            audio_f32,
            language=self._stt["language"],  # None = auto-detect
            task=task,
            fp16=False,  # CPU inference; fp16 only helps on GPU
        )
        text = result["text"].strip()
        detected_lang = result.get("language", "unknown")
        log.info("Transcribed [%s]: %s", detected_lang, text[:100])
        return text

    def _check_wake_word(self, text: str) -> bool:
        """Return True if `text` contains any configured wake word."""
        text_lower = text.lower().strip()

        # English: case-insensitive substring match.
        if any(w in text_lower for w in self._wake_en):
            return True

        # Arabic: match against the raw text (no case in Arabic script).
        return any(w in text for w in self._wake_ar)

    # ─── MAIN LOOP ────────────────────────────────────────

    def _voice_loop(self):
        """Main state-machine loop — runs in the background thread."""
        self._load_whisper()
        log.info("Voice loop started — listening for wake word...")

        # Best-effort: unmute the mic and set its volume once at startup.
        mic_source = str(self._mic["source_index"])
        subprocess.run(
            ["pactl", "set-source-mute", mic_source, "0"],
            capture_output=True,
        )
        subprocess.run(
            ["pactl", "set-source-volume", mic_source, "100%"],
            capture_output=True,
        )

        while self._running:
            try:
                if self._state == State.IDLE:
                    self._do_idle()
                elif self._state == State.WAKE_HEARD:
                    self._do_wake_heard()
                elif self._state == State.PROCESSING:
                    self._do_processing()
                elif self._state == State.SPEAKING:
                    # Wait for any TTS to finish before returning to IDLE.
                    while self._audio.is_speaking:
                        time.sleep(0.1)
                    self._state = State.IDLE
            except Exception as e:
                # Top-level guard: any failure resets to IDLE so the
                # listener keeps running instead of killing the thread.
                log.error("Voice loop error: %s", e, exc_info=True)
                self._state = State.IDLE
                time.sleep(1)

    def _do_idle(self):
        """Listen for the wake word in 2-second chunks."""
        # Skip while the robot is speaking — prevents hearing itself.
        if self._audio.is_speaking:
            time.sleep(0.2)
            return

        audio = self._record_chunk(2.0)

        # Double-check speaking didn't start during recording.
        if self._audio.is_speaking:
            return

        # Skip empty captures and near-silence. The empty-array guard matters:
        # np.array([]).std() is NaN, and NaN < 100 is False, so an empty
        # capture previously fell through to a wasted (and crashy) transcribe.
        if audio.size == 0 or audio.std() < 100:
            return

        text = self._transcribe(audio, self._wake_model)

        if self._check_wake_word(text):
            log.info("Wake word detected!")
            self._state = State.WAKE_HEARD

            # Acknowledge. NOTE(review): the ack is always English, even for
            # an Arabic wake word — confirm whether a localized message exists.
            self._audio.speak(
                self._config["messages"]["wake_heard"], "en"
            )

    def _do_wake_heard(self):
        """Record the spoken command until silence, then go to PROCESSING."""
        # Wait for the acknowledgement TTS to finish before recording,
        # otherwise the ack itself ends up in the command audio.
        while self._audio.is_speaking:
            time.sleep(0.1)

        log.info("Recording command...")
        audio = self._record_until_silence()

        if len(audio) < 4000:  # < 0.25s at 16kHz — too short to be speech
            log.info("Too short, ignoring")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._state = State.IDLE
            return

        self._command_audio = audio
        self._state = State.PROCESSING

    def _do_processing(self):
        """Transcribe the recorded command and hand it to the brain callback."""
        text = self._transcribe(self._command_audio, self._cmd_model)
        self._command_audio = None  # release the buffer promptly

        if not text or len(text.strip()) < 2:
            log.info("Empty transcription")
            self._audio.speak(self._config["messages"]["no_speech"], "en")
            self._state = State.IDLE
            return

        # Crude language detection: any character in the Arabic Unicode block.
        lang = "ar" if any('\u0600' <= c <= '\u06FF' for c in text) else "en"
        log.info("Command [%s]: %s", lang, text)

        # Dispatch to the brain; its errors must not kill the voice loop.
        if self._on_command:
            try:
                self._on_command(text, lang)
            except Exception as e:
                log.error("Brain callback error: %s", e)

        self._state = State.IDLE

    # ─── START / STOP ─────────────────────────────────────

    def start(self):
        """Start voice listening in a background daemon thread (idempotent)."""
        if self._running:
            log.warning("Voice module already running")
            return

        self._running = True
        self._state = State.IDLE
        self._thread = threading.Thread(target=self._voice_loop, daemon=True, name="voice")
        self._thread.start()
        log.info("Voice module started")

    def stop(self):
        """Stop voice listening and join the worker thread (5s timeout)."""
        self._running = False
        if self._thread:
            self._thread.join(timeout=5)
            self._thread = None
        log.info("Voice module stopped")

    @property
    def state(self) -> str:
        """Current state-machine state (one of the State constants)."""
        return self._state

    @property
    def is_running(self) -> bool:
        """True while the background loop is running (or starting up)."""
        return self._running
|
|
|
|
|
|
# ─── STANDALONE TEST ─────────────────────────────────────
|
|
|
|
if __name__ == "__main__":
    # Standalone smoke test: wake the module by voice and print each
    # transcribed command to stdout.
    import sys

    sys.path.insert(0, PROJECT_ROOT)
    from API.audio_api import AudioAPI

    def _print_command(text, lang):
        """Callback: dump a recognized command to the console."""
        bar = "=" * 50
        print("\n" + bar)
        print(f" COMMAND [{lang}]: {text}")
        print(bar + "\n")

    speaker = AudioAPI()
    module = VoiceModule(speaker, on_command=_print_command)

    print("Starting voice module... say 'Marcus' to wake.")
    print("Press Ctrl+C to stop.\n")

    module.start()

    try:
        while module.is_running:
            time.sleep(0.5)
    except KeyboardInterrupt:
        print("\nStopping...")

    module.stop()
    print("Done.")
|