Voice: finalise on faster-whisper + energy wake, remove Vosk
Full-day voice-stack refactor. Experiments run and reverted:
- Gemini Live HTTP microservice (Python 3.8 env incompat, latency)
- Vosk grammar STT (English lexicon can't decode 'Sanad'; big model
cold-load too slow on Jetson CPU)
Kept architecture:
- Voice/wake_detector.py — pure-numpy energy state machine with
adaptive baseline, burst-audio capture for post-hoc verify.
- Voice/marcus_voice.py — orchestrator with 3 modes
(wake_and_command / always_on / always_on_gated), hysteretic VAD,
pre-silence trim (300 ms pre-roll), DSP pipeline (DC remove,
80 Hz HPF, 0.97 pre-emphasis, peak-normalize), faster-whisper
base.en int8 with beam=8 + temperature fallback [0,0.2,0.4],
fuzzy-match canonicalisation, GARBAGE_PATTERNS + length filter,
/s-/ phonetic wake-verify, full-turn debug WAV recording.
Config-driven vocab (zero hardcoded strings in Python):
- stt.wake_words (33 variants of 'Sanad')
- stt.command_vocab (68 canonical phrases)
- stt.garbage_patterns (17 Whisper noise outputs)
- stt.min_transcription_length, stt.command_vocab_cutoff
Command parser widened (Brain/command_parser.py):
- _RE_SIMPLE_DIR — bare direction + verb+direction combos
('left', 'go back', 'move forward', 'step right', ...)
- _RE_STOP_SIMPLE — bare stop/halt/wait/pause/freeze/hold
- All motion constants sourced from config_Navigation.json
(move_map + step_duration_sec) via API/zmq_api.py; no more
hardcoded 0.3 / 2.0 magic numbers.
API/audio_api.py — _play_pcm now uses AudioClient.PlayStream with
automatic resampling to 16 kHz (matches Sanad's proven pattern).
Removed:
- Voice/vosk_stt.py (and all Vosk references in marcus_voice.py)
- Models/vosk-model-small-en-us-0.15/ (40 MB model + zip)
- All Vosk keys from Config/config_Voice.json
Documentation synced across README, Doc/architecture.md,
Doc/pipeline.md, Doc/functions.md, Doc/controlling.md,
Doc/MARCUS_API.md, Doc/environment.md changelog.
Known limitation: faster-whisper base.en on Jetson CPU + G1
far-field mic yields ~50% command-transcription accuracy due
to model capacity and mic reverberation. Wake + ack + recording
+ trim + Whisper + fuzzy + brain + motion all verified working
end-to-end. Future improvement path (unused): close-talking USB
mic via pactl_parec, or Gemini Live via HTTP microservice.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f45e12fae5
commit
5d839d4f4e
@ -211,56 +211,58 @@ class AudioAPI:
|
|||||||
|
|
||||||
# ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────
|
# ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────
|
||||||
|
|
||||||
def _play_pcm(self, audio_16k: np.ndarray) -> float:
|
def _play_pcm(self, audio: np.ndarray, rate: int = None) -> float:
|
||||||
"""Play 16kHz mono int16 on G1 speaker. Returns duration."""
|
"""
|
||||||
|
Play mono int16 PCM on the G1 speaker.
|
||||||
|
|
||||||
|
`rate` is the sample rate of the incoming `audio`; we always
|
||||||
|
resample to self._target_rate (16 kHz) before sending because the
|
||||||
|
G1 speaker hardware only honors that rate — if you hand it 24 kHz
|
||||||
|
PCM, it plays ~1.5x too fast. This matches the Sanad pattern.
|
||||||
|
|
||||||
|
Uses AudioClient.PlayStream (the high-level API) with a fresh
|
||||||
|
stream_id + STOP_PLAY bracket on either side so a prior stream
|
||||||
|
can't blend into this one.
|
||||||
|
"""
|
||||||
if not self._sdk_available:
|
if not self._sdk_available:
|
||||||
log.warning("SDK not available, cannot play audio")
|
log.warning("SDK not available, cannot play audio")
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
from unitree_sdk2py.g1.audio.g1_audio_api import (
|
src_rate = int(rate) if rate else self._target_rate
|
||||||
ROBOT_API_ID_AUDIO_START_PLAY,
|
audio = self._resample(audio, src_rate) # → self._target_rate
|
||||||
ROBOT_API_ID_AUDIO_STOP_PLAY,
|
if audio.size == 0:
|
||||||
)
|
return 0.0
|
||||||
|
|
||||||
|
from unitree_sdk2py.g1.audio.g1_audio_api import ROBOT_API_ID_AUDIO_STOP_PLAY
|
||||||
|
|
||||||
app_name = self._spk["app_name"]
|
app_name = self._spk["app_name"]
|
||||||
|
|
||||||
# Stop previous stream
|
# Stop any prior stream before opening a new one.
|
||||||
self._client._Call(
|
self._client._Call(
|
||||||
ROBOT_API_ID_AUDIO_STOP_PLAY,
|
ROBOT_API_ID_AUDIO_STOP_PLAY,
|
||||||
json.dumps({"app_name": app_name}),
|
json.dumps({"app_name": app_name}),
|
||||||
)
|
)
|
||||||
time.sleep(0.3)
|
time.sleep(0.15)
|
||||||
|
|
||||||
# Build params — unique stream_id every call
|
|
||||||
pcm = audio_16k.tobytes()
|
|
||||||
sid = f"s_{int(time.time() * 1000)}"
|
sid = f"s_{int(time.time() * 1000)}"
|
||||||
param = json.dumps({
|
self._client.PlayStream(app_name, sid, audio.tobytes())
|
||||||
"app_name": app_name,
|
|
||||||
"stream_id": sid,
|
|
||||||
"sample_rate": self._target_rate,
|
|
||||||
"channels": 1,
|
|
||||||
"bits_per_sample": 16,
|
|
||||||
})
|
|
||||||
|
|
||||||
# Single call — full buffer
|
duration = len(audio) / self._target_rate
|
||||||
self._client._CallRequestWithParamAndBin(
|
# Margin covers DDS buffer drain before STOP cuts playback short.
|
||||||
ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm)
|
time.sleep(duration + 0.3)
|
||||||
)
|
|
||||||
|
|
||||||
duration = len(audio_16k) / self._target_rate
|
|
||||||
time.sleep(duration + 0.5)
|
|
||||||
|
|
||||||
self._client._Call(
|
self._client._Call(
|
||||||
ROBOT_API_ID_AUDIO_STOP_PLAY,
|
ROBOT_API_ID_AUDIO_STOP_PLAY,
|
||||||
json.dumps({"app_name": app_name}),
|
json.dumps({"app_name": app_name}),
|
||||||
)
|
)
|
||||||
|
|
||||||
log.info("Played: %.1fs", duration)
|
log.info("Played: %.1fs (src=%d Hz → hw=%d Hz)",
|
||||||
|
duration, src_rate, self._target_rate)
|
||||||
return duration
|
return duration
|
||||||
|
|
||||||
def play_pcm(self, audio_16k: np.ndarray) -> float:
|
def play_pcm(self, audio: np.ndarray, rate: int = None) -> float:
|
||||||
"""Public wrapper for playing PCM audio."""
|
"""Public wrapper for playing PCM audio."""
|
||||||
return self._play_pcm(audio_16k)
|
return self._play_pcm(audio, rate=rate)
|
||||||
|
|
||||||
# ─── MIC RECORDING ───────────────────────────────────
|
# ─── MIC RECORDING ───────────────────────────────────
|
||||||
|
|
||||||
|
|||||||
@ -80,6 +80,11 @@ def send_cmd(cmd: str):
|
|||||||
_ensure_sock().send_string(json.dumps({"cmd": cmd}))
|
_ensure_sock().send_string(json.dumps({"cmd": cmd}))
|
||||||
|
|
||||||
|
|
||||||
# Load MOVE_MAP from navigation config (pure data, safe at import time)
|
# Load navigation constants from config (pure data, safe at import time).
|
||||||
|
# MOVE_MAP[direction] = (vx, vy, vyaw). STEP_DURATION_SEC is how long one
|
||||||
|
# "step" of a bare directional command lasts (2 s at default velocities
|
||||||
|
# ≈ 60 cm forward or 34° turn). Both live in config_Navigation.json so
|
||||||
|
# motion can be retuned without editing Python.
|
||||||
_nav = load_config("Navigation")
|
_nav = load_config("Navigation")
|
||||||
MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()}
|
MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()}
|
||||||
|
STEP_DURATION_SEC = float(_nav.get("step_duration_sec", 2.0))
|
||||||
|
|||||||
@ -4,7 +4,7 @@ Handles place memory, odometry, session recall, help, examples
|
|||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from API.zmq_api import send_vel, gradual_stop
|
from API.zmq_api import send_vel, gradual_stop, MOVE_MAP, STEP_DURATION_SEC
|
||||||
from API.memory_api import mem, place_save, place_goto, places_list_str
|
from API.memory_api import mem, place_save, place_goto, places_list_str
|
||||||
from API.odometry_api import odom, ODOM_AVAILABLE
|
from API.odometry_api import odom, ODOM_AVAILABLE
|
||||||
from API.camera_api import get_frame
|
from API.camera_api import get_frame
|
||||||
@ -36,6 +36,30 @@ _RE_WALK_STEP = re.compile(
|
|||||||
r"^(?:walk|go|move|step)(?:\s+(forward|back(?:ward)?))?\s+(\d+)\s*steps?$", re.I)
|
r"^(?:walk|go|move|step)(?:\s+(forward|back(?:ward)?))?\s+(\d+)\s*steps?$", re.I)
|
||||||
_RE_TURN_STEP = re.compile(
|
_RE_TURN_STEP = re.compile(
|
||||||
r"^turn\s+(left|right)(?:\s+(\d+)\s*steps?)?$", re.I)
|
r"^turn\s+(left|right)(?:\s+(\d+)\s*steps?)?$", re.I)
|
||||||
|
|
||||||
|
# Simple one-shot motion — one word or verb+direction, no counts/units.
|
||||||
|
# All default to a ~2 s motion at the normal velocity. Kept local so the
|
||||||
|
# user doesn't eat a 5 s Qwen round-trip for a trivial "go back".
|
||||||
|
#
|
||||||
|
# Matches:
|
||||||
|
# "left" / "right" / "forward" / "back" / "backward"
|
||||||
|
# "go back" / "step back" / "move back" / "walk back" / "run back"
|
||||||
|
# "go forward" / "step forward" / "move forward" / "walk forward"
|
||||||
|
# "go left" / "move right" / "step left" / etc.
|
||||||
|
# "head forward" / "head back"
|
||||||
|
# Does NOT match multi-word phrases like "walk to the chair" — those
|
||||||
|
# still fall through to Qwen where they belong.
|
||||||
|
_RE_SIMPLE_DIR = re.compile(
|
||||||
|
r"^(?:(?:walk|go|move|step|run|head)\s+)?"
|
||||||
|
r"(forward|back(?:ward)?|left|right)$",
|
||||||
|
re.I,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Bare stop / pause words — no need to ask Qwen what "stop" means.
|
||||||
|
_RE_STOP_SIMPLE = re.compile(
|
||||||
|
r"^(?:stop|halt|wait|pause|stay|freeze|hold|stand\s+still|don'?t\s+move)$",
|
||||||
|
re.I,
|
||||||
|
)
|
||||||
_RE_PATROL_RT = re.compile(
|
_RE_PATROL_RT = re.compile(
|
||||||
r"^patrol[/:]\s*(.+)$", re.I)
|
r"^patrol[/:]\s*(.+)$", re.I)
|
||||||
_RE_LAST_CMD = re.compile(
|
_RE_LAST_CMD = re.compile(
|
||||||
@ -115,9 +139,10 @@ def try_local_command(cmd: str) -> bool:
|
|||||||
if odom:
|
if odom:
|
||||||
odom.walk_distance(meters)
|
odom.walk_distance(meters)
|
||||||
else:
|
else:
|
||||||
|
vx, _, _ = MOVE_MAP["forward"]
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
while time.time() - t0 < meters / 0.3:
|
while time.time() - t0 < meters / abs(vx):
|
||||||
send_vel(vx=0.3)
|
send_vel(vx=vx)
|
||||||
time.sleep(0.05)
|
time.sleep(0.05)
|
||||||
gradual_stop()
|
gradual_stop()
|
||||||
return True
|
return True
|
||||||
@ -128,9 +153,10 @@ def try_local_command(cmd: str) -> bool:
|
|||||||
if odom:
|
if odom:
|
||||||
odom.walk_distance(meters, direction="backward")
|
odom.walk_distance(meters, direction="backward")
|
||||||
else:
|
else:
|
||||||
|
vx, _, _ = MOVE_MAP["backward"]
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
while time.time() - t0 < meters / 0.2:
|
while time.time() - t0 < meters / abs(vx):
|
||||||
send_vel(vx=-0.2)
|
send_vel(vx=vx)
|
||||||
time.sleep(0.05)
|
time.sleep(0.05)
|
||||||
gradual_stop()
|
gradual_stop()
|
||||||
return True
|
return True
|
||||||
@ -144,9 +170,13 @@ def try_local_command(cmd: str) -> bool:
|
|||||||
if odom:
|
if odom:
|
||||||
odom.turn_degrees(degrees)
|
odom.turn_degrees(degrees)
|
||||||
else:
|
else:
|
||||||
|
# vyaw magnitude comes from MOVE_MAP["left"]; duration is
|
||||||
|
# abs(degrees)/(vyaw_deg_per_sec). vyaw in config is rad/s.
|
||||||
|
_, _, vyaw_mag = MOVE_MAP["left"]
|
||||||
|
vyaw_deg_per_sec = abs(vyaw_mag) * 180.0 / 3.14159265
|
||||||
|
vyaw = vyaw_mag if degrees > 0 else -vyaw_mag
|
||||||
|
duration = abs(degrees) / vyaw_deg_per_sec
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
vyaw = 0.3 if degrees > 0 else -0.3
|
|
||||||
duration = abs(degrees) / 17.2
|
|
||||||
while time.time() - t0 < duration:
|
while time.time() - t0 < duration:
|
||||||
send_vel(vyaw=vyaw)
|
send_vel(vyaw=vyaw)
|
||||||
time.sleep(0.05)
|
time.sleep(0.05)
|
||||||
@ -156,9 +186,11 @@ def try_local_command(cmd: str) -> bool:
|
|||||||
m = _RE_WALK_STEP.match(cmd)
|
m = _RE_WALK_STEP.match(cmd)
|
||||||
if m:
|
if m:
|
||||||
direction = (m.group(1) or "forward").lower()
|
direction = (m.group(1) or "forward").lower()
|
||||||
|
if direction.startswith("back"):
|
||||||
|
direction = "backward"
|
||||||
steps = int(m.group(2))
|
steps = int(m.group(2))
|
||||||
vx = -0.2 if direction.startswith("back") else 0.3
|
vx, _, _ = MOVE_MAP[direction]
|
||||||
duration = 2.0 * steps
|
duration = STEP_DURATION_SEC * steps
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
while time.time() - t0 < duration:
|
while time.time() - t0 < duration:
|
||||||
send_vel(vx=vx)
|
send_vel(vx=vx)
|
||||||
@ -170,8 +202,8 @@ def try_local_command(cmd: str) -> bool:
|
|||||||
if m:
|
if m:
|
||||||
direction = m.group(1).lower()
|
direction = m.group(1).lower()
|
||||||
steps = int(m.group(2)) if m.group(2) else 1
|
steps = int(m.group(2)) if m.group(2) else 1
|
||||||
vyaw = 0.3 if direction == "left" else -0.3
|
_, _, vyaw = MOVE_MAP[direction]
|
||||||
duration = 2.0 * steps
|
duration = STEP_DURATION_SEC * steps
|
||||||
t0 = time.time()
|
t0 = time.time()
|
||||||
while time.time() - t0 < duration:
|
while time.time() - t0 < duration:
|
||||||
send_vel(vyaw=vyaw)
|
send_vel(vyaw=vyaw)
|
||||||
@ -179,6 +211,31 @@ def try_local_command(cmd: str) -> bool:
|
|||||||
gradual_stop()
|
gradual_stop()
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
# ── BARE / SIMPLE DIRECTIONAL COMMANDS ───────────────────────────────
|
||||||
|
# "left", "right", "forward", "back", "go back", "move forward",
|
||||||
|
# "step right", "walk back", "head forward" — any one-word direction
|
||||||
|
# or verb+direction with no explicit count. Duration and velocities
|
||||||
|
# come entirely from config_Navigation.json (MOVE_MAP +
|
||||||
|
# step_duration_sec) — no magic numbers here.
|
||||||
|
m = _RE_SIMPLE_DIR.match(cmd)
|
||||||
|
if m:
|
||||||
|
direction = m.group(1).lower()
|
||||||
|
if direction.startswith("back"):
|
||||||
|
direction = "backward"
|
||||||
|
vx, vy, vyaw = MOVE_MAP[direction]
|
||||||
|
t0 = time.time()
|
||||||
|
while time.time() - t0 < STEP_DURATION_SEC:
|
||||||
|
send_vel(vx=vx, vy=vy, vyaw=vyaw)
|
||||||
|
time.sleep(0.05)
|
||||||
|
gradual_stop()
|
||||||
|
return True
|
||||||
|
|
||||||
|
# ── BARE STOP / PAUSE ─────────────────────────────────────────────────
|
||||||
|
m = _RE_STOP_SIMPLE.match(cmd)
|
||||||
|
if m:
|
||||||
|
gradual_stop()
|
||||||
|
return True
|
||||||
|
|
||||||
# ── NAMED PATROL ROUTE ───────────────────────────────────────────────
|
# ── NAMED PATROL ROUTE ───────────────────────────────────────────────
|
||||||
m = _RE_PATROL_RT.match(cmd)
|
m = _RE_PATROL_RT.match(cmd)
|
||||||
if m:
|
if m:
|
||||||
@ -234,17 +291,11 @@ def try_local_command(cmd: str) -> bool:
|
|||||||
for phrase, (reverse_dir, _) in move_words.items():
|
for phrase, (reverse_dir, _) in move_words.items():
|
||||||
if phrase in cl:
|
if phrase in cl:
|
||||||
print(f" Undoing: '{c}' → reversing with '{reverse_dir}'")
|
print(f" Undoing: '{c}' → reversing with '{reverse_dir}'")
|
||||||
dur, t0 = 2.0, time.time()
|
vx, vy, vyaw = MOVE_MAP[reverse_dir]
|
||||||
if reverse_dir in ("left", "right"):
|
t0 = time.time()
|
||||||
vyaw = 0.3 if reverse_dir == "left" else -0.3
|
while time.time() - t0 < STEP_DURATION_SEC:
|
||||||
while time.time() - t0 < dur:
|
send_vel(vx=vx, vy=vy, vyaw=vyaw)
|
||||||
send_vel(vyaw=vyaw)
|
time.sleep(0.05)
|
||||||
time.sleep(0.05)
|
|
||||||
else:
|
|
||||||
vx = 0.3 if reverse_dir == "forward" else -0.2
|
|
||||||
while time.time() - t0 < dur:
|
|
||||||
send_vel(vx=vx)
|
|
||||||
time.sleep(0.05)
|
|
||||||
gradual_stop()
|
gradual_stop()
|
||||||
return True
|
return True
|
||||||
print(" No movement command to undo")
|
print(" No movement command to undo")
|
||||||
|
|||||||
@ -3,6 +3,8 @@
|
|||||||
"min_steps_before_check": 3,
|
"min_steps_before_check": 3,
|
||||||
"scan_interval_s": 0.4,
|
"scan_interval_s": 0.4,
|
||||||
"rotation_speed": 0.3,
|
"rotation_speed": 0.3,
|
||||||
|
"_step_duration_comment": "Duration of one 'step' for bare directional commands ('go back', 'turn right', etc.). 2.0s at move_map velocities ≈ 60 cm forward, 40 cm back, 34° turn. Change here and every regex fast-path in command_parser.py uses the new value.",
|
||||||
|
"step_duration_sec": 2.0,
|
||||||
"move_map": {
|
"move_map": {
|
||||||
"forward": [0.3, 0.0, 0.0],
|
"forward": [0.3, 0.0, 0.0],
|
||||||
"backward": [-0.2, 0.0, 0.0],
|
"backward": [-0.2, 0.0, 0.0],
|
||||||
|
|||||||
@ -5,14 +5,119 @@
|
|||||||
"target_sample_rate": 16000
|
"target_sample_rate": 16000
|
||||||
},
|
},
|
||||||
"stt": {
|
"stt": {
|
||||||
"backend": "custom_acoustic",
|
"backend": "faster_whisper",
|
||||||
"_comment": "Pure-DSP wake detector in Voice/wake_detector.py. No ML.",
|
"_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.",
|
||||||
"speech_threshold": 150.0,
|
|
||||||
"min_word_duration": 0.20,
|
"_mode_comment": "Three modes: 'wake_and_command' = instant acoustic wake detector (no ML) hears 'Sanad', THEN records a ~2s command, transcribes once — fastest, most reliable on G1 mic. 'always_on' = continuous VAD → Whisper every utterance, dispatch all (chatty, LLM gets every noise). 'always_on_gated' = continuous transcribe, dispatch only utterances containing 'Sanad' (Sanad-style but Whisper hallucinates commands from TTS echo on G1 mic, creating feedback loops — keep as opt-in, not default).",
|
||||||
"max_word_duration": 1.50,
|
"mode": "wake_and_command",
|
||||||
"post_silence": 0.30,
|
|
||||||
"wake_cooldown": 1.50,
|
"_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.",
|
||||||
"wake_chunk_ms": 50
|
"always_on_speech_entry_rms": 150.0,
|
||||||
|
"always_on_silence_exit_rms": 70.0,
|
||||||
|
"always_on_silence_duration_sec": 0.8,
|
||||||
|
"always_on_min_utterance_sec": 0.3,
|
||||||
|
"always_on_max_utterance_sec": 12.0,
|
||||||
|
"always_on_idle_log_sec": 5.0,
|
||||||
|
"always_on_ambient_mult": 1.4,
|
||||||
|
"always_on_ambient_window_chunks": 100,
|
||||||
|
|
||||||
|
|
||||||
|
"whisper_model": "base.en",
|
||||||
|
"whisper_device": "cpu",
|
||||||
|
"whisper_compute_type": "int8",
|
||||||
|
|
||||||
|
"_whisper_tuning_comment": "base.en is the only model that decodes fast enough on Jetson Orin NX CPU. TESTED: small.en takes 10-12s per 1s burst (unusable); base.en runs ~2-3s per burst. tiny.en is even faster (~1s) but noticeably worse accuracy. If accuracy is poor on base.en (garbled transcriptions), the fix is hardware — switch to a close-talking USB mic (Hollyland) via mic.backend:pactl_parec. small.en cached in ~/.cache/huggingface/hub/ if you want to experiment again — try it on an x86 dev machine to see the accuracy gain before blaming Jetson.",
|
||||||
|
"mic_gain": 1.0,
|
||||||
|
"whisper_beam_size": 8,
|
||||||
|
"whisper_no_speech_threshold": 0.85,
|
||||||
|
"whisper_log_prob_threshold": -1.8,
|
||||||
|
"whisper_compression_ratio_threshold": 3.0,
|
||||||
|
"whisper_temperature_fallback": [0.0, 0.2, 0.4],
|
||||||
|
"_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.",
|
||||||
|
"_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.",
|
||||||
|
"_initial_prompt_comment": "RECOMMENDED EMPTY (set to ''). NOTE: the value below is currently the short cue 'Robot voice command.' — not the empty default described here; clear it for a fully unbiased decode. Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). A clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Keep a single short cue only if you want nudging and can tolerate occasional echoes.",
|
||||||
|
"whisper_initial_prompt": "Robot voice command.",
|
||||||
|
|
||||||
|
"_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.",
|
||||||
|
"wake_words": [
|
||||||
|
"sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad",
|
||||||
|
"sanat", "sunnat", "sonnat", "sinnat", "sennat",
|
||||||
|
"sanid", "sanud", "saned", "sanod", "sanaad",
|
||||||
|
"senad", "sinad", "sonad", "sunad",
|
||||||
|
"sanah", "sanath", "sanadh", "sonadh",
|
||||||
|
"samad", "somad", "sumad",
|
||||||
|
"thanad", "zanad",
|
||||||
|
"sa nad", "san ad", "san odd", "san add"
|
||||||
|
],
|
||||||
|
"_wake_words_exclude_comment": "DELIBERATELY EXCLUDED from wake_words: 'said', 'sent', 'sand', 'sandy', 'sunday', 'signed', 'synod', 'sonata', 'sonnet', 'senate', 'sane', 'saint', 'sana'. These collide with common English and would false-trigger the gate.",
|
||||||
|
"command_vocab": [
|
||||||
|
"what do you see", "what can you see", "look around",
|
||||||
|
"come to me", "come here", "come back", "come closer",
|
||||||
|
"approach", "get closer", "come",
|
||||||
|
"go home", "go back", "go forward", "go backward",
|
||||||
|
"go left", "go right", "go",
|
||||||
|
"sit down", "stand up", "sit", "stand",
|
||||||
|
"raise arm", "lower arm", "wave hello", "wave", "point",
|
||||||
|
"turn left", "turn right", "turn around",
|
||||||
|
"move forward", "move backward", "move back",
|
||||||
|
"move left", "move right",
|
||||||
|
"walk forward", "walk backward", "walk back",
|
||||||
|
"step forward", "step back", "step left", "step right",
|
||||||
|
"forward", "backward", "back", "left", "right",
|
||||||
|
"patrol", "stop", "halt", "wait", "pause", "freeze", "hold",
|
||||||
|
"hello", "hi", "hey", "help",
|
||||||
|
"who are you", "where are you", "where am i", "what is your name",
|
||||||
|
"remember this", "forget", "do it again", "repeat", "undo",
|
||||||
|
"follow me", "stay here"
|
||||||
|
],
|
||||||
|
"command_vocab_cutoff": 0.72,
|
||||||
|
"_garbage_comment": "Whisper's known 'no phonetic content' outputs on low-SNR audio. YouTube outros, filler words, single-letter hallucinations. Any transcription matching one of these (or shorter than min_transcription_length) is rejected before fuzzy-match — treated as silence.",
|
||||||
|
"garbage_patterns": [
|
||||||
|
"thanks for watching", "thank you for watching",
|
||||||
|
"thank you", "thanks",
|
||||||
|
"bye", "goodbye",
|
||||||
|
".", "you", "yeah",
|
||||||
|
"okay", "ok",
|
||||||
|
"um", "uh", "hmm", "mm",
|
||||||
|
"i", "a"
|
||||||
|
],
|
||||||
|
"min_transcription_length": 3,
|
||||||
|
|
||||||
|
|
||||||
|
"_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Ambient measured in this room ≈ 250-350, so 400 gives margin.",
|
||||||
|
"speech_threshold": 400.0,
|
||||||
|
"min_word_duration": 0.25,
|
||||||
|
"max_word_duration": 2.50,
|
||||||
|
"post_silence": 0.20,
|
||||||
|
"wake_cooldown": 1.00,
|
||||||
|
"wake_chunk_ms": 50,
|
||||||
|
"wake_adaptive_window_n": 50,
|
||||||
|
"wake_adaptive_mult": 3.0,
|
||||||
|
"wake_diag_log_sec": 3.0,
|
||||||
|
|
||||||
|
"wake_ack": "tts",
|
||||||
|
"_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).",
|
||||||
|
|
||||||
|
"_wake_verify_comment": "DISABLED for speed. When enabled, runs a ~3s Whisper decode on each wake burst and rejects non-/sa-/ speech — good for filtering coughs/claps, but adds 3s latency per wake. With it off, every acoustic wake fires — faster response, more false wakes from loud noises (but those drop silently at the 'no speech' recording stage, so user impact is near-zero). Set true again if background noise is triggering too many false wakes.",
|
||||||
|
"wake_verify_enabled": false,
|
||||||
|
|
||||||
|
|
||||||
|
"_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.",
|
||||||
|
"speech_entry_rms": 400.0,
|
||||||
|
"silence_exit_rms": 200.0,
|
||||||
|
"_vad_tuning_comment": "silence_duration_sec = how long of quiet ends an utterance. 0.6 cuts short commands fast (good UX) but may clip a thinking pause. ambient_mult = effective_entry multiplier over measured ambient (cmd is entry = max(speech_entry_rms, ambient * mult * 1.8)). Smaller = more eager, catches quieter speech. 1.5 matches the wake-mult behavior.",
|
||||||
|
"silence_duration_sec": 0.6,
|
||||||
|
"max_record_sec": 5.0,
|
||||||
|
"min_record_sec": 0.4,
|
||||||
|
"ambient_probe_sec": 0.2,
|
||||||
|
"ambient_mult": 1.5,
|
||||||
|
"ambient_cap_rms": 200.0,
|
||||||
|
"_recording_comment": "Debug recording — save every command turn's audio to Data/Voice/Recordings/ as WAV. Filename includes epoch timestamp + transcription slug so you can replay what Whisper got and compare to what it heard. Rotates to keep most recent N files. Filename prefixes: 'cmd_*' = successful transcription, 'unk_*' = empty/rejected.",
|
||||||
|
"recording_enabled": true,
|
||||||
|
"recording_keep_count": 50,
|
||||||
|
"command_cooldown_sec": 1.5,
|
||||||
|
"post_tts_settle_sec": 0.4,
|
||||||
|
"_post_tts_settle_comment": "Time the mic ignores input AFTER the robot finishes speaking. Too short → TTS echo becomes false utterance. Too long → user speaks during the dead window and first syllables are clipped. 0.4s matches the G1 speaker decay at mic_gain=1.0; raise if you bump mic_gain above 1.5, lower if users report 'it cut off my first word'."
|
||||||
},
|
},
|
||||||
"mic": {
|
"mic": {
|
||||||
"backend": "builtin_udp",
|
"backend": "builtin_udp",
|
||||||
|
|||||||
@ -25,6 +25,15 @@ main_prompt: |
|
|||||||
- speak: actually describe what you are doing OR what the camera shows right now. Do NOT copy example text. First person. English.
|
- speak: actually describe what you are doing OR what the camera shows right now. Do NOT copy example text. First person. English.
|
||||||
- abort: null normally; "obstacle detected" / "unsafe command" / "cannot comply" with actions=[] when unsafe.
|
- abort: null normally; "obstacle detected" / "unsafe command" / "cannot comply" with actions=[] when unsafe.
|
||||||
|
|
||||||
|
- CRITICAL — IF THE COMMAND IS UNCLEAR OR NOT AN ACTION:
|
||||||
|
If the input text is a single unclear word (like "I", "alright", "ok", "um"),
|
||||||
|
a random phrase ("I have a lot of beauty", "turn turn turn"), noise, a
|
||||||
|
greeting ("hello", "hi"), or anything that isn't clearly a movement /
|
||||||
|
arm / vision / memory instruction — DO NOT INVENT a command. Instead
|
||||||
|
reply with:
|
||||||
|
{{"actions":[],"arm":null,"speak":"Sorry, I didn't understand that — please repeat","abort":"command not understood"}}
|
||||||
|
Better to ask again than to guess and perform the wrong action.
|
||||||
|
|
||||||
Examples (learn the STRUCTURE, don't reuse the speak text):
|
Examples (learn the STRUCTURE, don't reuse the speak text):
|
||||||
"turn right" → {{"actions":[{{"move":"right","duration":2.0}}],"arm":null,"speak":"Turning right","abort":null}}
|
"turn right" → {{"actions":[{{"move":"right","duration":2.0}}],"arm":null,"speak":"Turning right","abort":null}}
|
||||||
"walk 2 steps" → {{"actions":[{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Walking forward","abort":null}}
|
"walk 2 steps" → {{"actions":[{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Walking forward","abort":null}}
|
||||||
|
|||||||
@ -29,7 +29,7 @@
|
|||||||
| Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. |
|
| Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. |
|
||||||
| Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. |
|
| Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. |
|
||||||
| Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. |
|
| Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. |
|
||||||
| Gemini voice deleted | `Voice/marcus_gemini_voice.py` removed | `_init_voice()` now spawns `Voice.marcus_voice.VoiceModule` (Whisper wake + command STT). No more WebSocket, no more asyncio event loop, no API key. |
|
| Voice stack finalised | `Voice/marcus_voice.py`, `Voice/wake_detector.py` | Custom energy wake detector (pure numpy) + Whisper verify + faster-whisper command STT + fuzzy-match to canonical commands. Vosk experiment reverted; Gemini Live reverted. Single local STT engine. |
|
||||||
| Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. |
|
| Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. |
|
||||||
| Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. |
|
| Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. |
|
||||||
| Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. |
|
| Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. |
|
||||||
@ -766,9 +766,9 @@ SAFETY:
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 15. Voice API (mic + TTS + STT)
|
## 15. Voice API (mic + TTS + wake + STT)
|
||||||
|
|
||||||
New pipeline as of 2026-04-21. Replaces the Gemini live WebSocket + edge-tts/Piper stack.
|
Current pipeline: G1 mic → custom energy wake detector → Whisper verify → TtsMaker "Yes" → record → faster-whisper transcribe → fuzzy-match canonical command → brain. Replaces all prior experiments (Gemini Live WebSocket, Vosk grammar, edge-tts / Piper).
|
||||||
|
|
||||||
### Mic — `Voice.builtin_mic.BuiltinMic`
|
### Mic — `Voice.builtin_mic.BuiltinMic`
|
||||||
|
|
||||||
@ -800,9 +800,36 @@ tts.speak("Hello, I am Sanad", block=True) # synth + play on G1 body speaker
|
|||||||
|
|
||||||
Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly.
|
Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly.
|
||||||
|
|
||||||
### Wake + command loop — `Voice.marcus_voice.VoiceModule`
|
### Wake detection — `Voice.wake_detector.WakeDetector`
|
||||||
|
|
||||||
Four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` detects the wake word, `small` transcribes commands.
|
Pure-numpy energy state machine with adaptive noise floor. Classifies any 0.35-1.5 s speech burst as a candidate wake, captures the audio for post-hoc verification.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from Voice.wake_detector import WakeDetector, WakeConfig
|
||||||
|
cfg = WakeConfig(
|
||||||
|
sample_rate=16_000,
|
||||||
|
speech_threshold=400.0, # min RMS floor — above noise
|
||||||
|
min_word_duration_s=0.35, # filter out coughs (<0.35s)
|
||||||
|
max_word_duration_s=1.50, # filter out sentences
|
||||||
|
post_silence_s=0.30, # how long silence marks word end
|
||||||
|
cooldown_s=1.50, # min gap between fires
|
||||||
|
chunk_ms=50, # RMS analysis window
|
||||||
|
adaptive_window_n=50, # rolling mean of idle RMS
|
||||||
|
adaptive_mult=3.0, # effective = max(floor, baseline×mult)
|
||||||
|
)
|
||||||
|
det = WakeDetector(cfg)
|
||||||
|
while True:
|
||||||
|
pcm = mic.read_chunk(1024)
|
||||||
|
if det.process(pcm):
|
||||||
|
burst = det.get_last_burst() # audio that triggered wake
|
||||||
|
break
|
||||||
|
```
|
||||||
|
|
||||||
|
Config under `config_Voice.json::stt.{speech_threshold, min_word_duration, …}`.
|
||||||
|
|
||||||
|
### Voice orchestrator — `Voice.marcus_voice.VoiceModule`
|
||||||
|
|
||||||
|
Drives the full pipeline: wake detector → Whisper verify → record → transcribe → fuzzy-match → dispatch. Three operating modes (`wake_and_command`, `always_on`, `always_on_gated`) selectable via `stt.mode`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from API.audio_api import AudioAPI
|
from API.audio_api import AudioAPI
|
||||||
@ -818,7 +845,9 @@ voice.start() # background thread
|
|||||||
voice.stop()
|
voice.stop()
|
||||||
```
|
```
|
||||||
|
|
||||||
Wake words are configured in `config_Voice.json::stt.wake_words_en`. The brain's `_init_voice()` wires `on_command` to `process_command(text)` + `audio_api.speak(reply)`.
|
Vocabulary (`wake_words`, `command_vocab`, `garbage_patterns`) is loaded from `config_Voice.json::stt.*` at `VoiceModule.__init__`. All thresholds, Whisper params, and mode selection live in the same config — no Python edits required to tune. See `Doc/controlling.md` → "Voice" for the tuning-knobs cheat sheet.
|
||||||
|
|
||||||
|
The brain's `_init_voice()` wires `on_command` to `process_command(text)` → `audio_api.speak(reply)`.
|
||||||
|
|
||||||
### AudioAPI — `API.audio_api.AudioAPI`
|
### AudioAPI — `API.audio_api.AudioAPI`
|
||||||
|
|
||||||
|
|||||||
@ -15,7 +15,7 @@
|
|||||||
- **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import.
|
- **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import.
|
||||||
- **G1 built-in microphone** via UDP multicast `239.168.123.161:5555` — `Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic.
|
- **G1 built-in microphone** via UDP multicast `239.168.123.161:5555` — `Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic.
|
||||||
- **G1 built-in TTS** via `client.TtsMaker()` — `Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed.
|
- **G1 built-in TTS** via `client.TtsMaker()` — `Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed.
|
||||||
- **Gemini voice module deleted** — Whisper wake-word + command STT path is now authoritative (`Voice/marcus_voice.py`).
|
- **Voice stack finalised** — custom energy wake detector (`Voice/wake_detector.py`) + faster-whisper command STT (`Voice/marcus_voice.py`). Whisper verifies each acoustic wake before acking. Gemini voice module and Vosk grammar STT both tried and removed.
|
||||||
- **Subsystem flags** — `config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages.
|
- **Subsystem flags** — `config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages.
|
||||||
- **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps.
|
- **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps.
|
||||||
- **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo clones cleanly on macOS/Windows.
|
- **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo clones cleanly on macOS/Windows.
|
||||||
@ -66,7 +66,7 @@ Marcus/
|
|||||||
│ ├── config_Memory.json # session/places paths
|
│ ├── config_Memory.json # session/places paths
|
||||||
│ ├── config_Network.json # Jetson IPs (eth0/wlan0), ports
|
│ ├── config_Network.json # Jetson IPs (eth0/wlan0), ports
|
||||||
│ ├── config_ImageSearch.json # search defaults
|
│ ├── config_ImageSearch.json # search defaults
|
||||||
│ ├── config_Voice.json # mic (builtin_udp|pactl_parec), TTS backend, wake words, mic_udp group/port
|
│ ├── config_Voice.json # mic, TTS, wake detector thresholds, Whisper params, wake_words/command_vocab/garbage_patterns vocab lists, VAD thresholds
|
||||||
│ ├── config_LiDAR.json # Livox Mid-360 connection + SLAM engine params
|
│ ├── config_LiDAR.json # Livox Mid-360 connection + SLAM engine params
|
||||||
│ └── marcus_prompts.yaml # All Qwen-VL prompts (main, goal, patrol, talk, verify, 2× imgsearch)
|
│ └── marcus_prompts.yaml # All Qwen-VL prompts (main, goal, patrol, talk, verify, 2× imgsearch)
|
||||||
│ # Total: 12 JSON files + 1 YAML. (config_Memory.json removed 2026-04-21.)
|
│ # Total: 12 JSON files + 1 YAML. (config_Memory.json removed 2026-04-21.)
|
||||||
@ -83,10 +83,11 @@ Marcus/
|
|||||||
│ ├── audio_api.py # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic
|
│ ├── audio_api.py # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic
|
||||||
│ └── lidar_api.py # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status()
|
│ └── lidar_api.py # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status()
|
||||||
│
|
│
|
||||||
├── Voice/ # Mic + TTS + wake-word STT
|
├── Voice/ # Mic + TTS + wake detector + faster-whisper STT
|
||||||
│ ├── builtin_mic.py # G1 array mic via UDP multicast 239.168.123.161:5555
|
│ ├── builtin_mic.py # G1 array mic via UDP multicast 239.168.123.161:5555
|
||||||
│ ├── builtin_tts.py # BuiltinTTS — client.TtsMaker(text, speaker_id)
|
│ ├── builtin_tts.py # BuiltinTTS — client.TtsMaker(text, speaker_id)
|
||||||
│ └── marcus_voice.py # VoiceModule — Whisper tiny (wake) + small (command) state machine
|
│ ├── wake_detector.py # Pure-numpy energy wake detector (WakeDetector, WakeConfig) with adaptive baseline
|
||||||
|
│ └── marcus_voice.py # VoiceModule — orchestrates wake → verify → record → Whisper → dispatch
|
||||||
│
|
│
|
||||||
├── Brain/ # Decision logic — imports ONLY from API/
|
├── Brain/ # Decision logic — imports ONLY from API/
|
||||||
│ ├── marcus_brain.py # Orchestrator: init_brain(), process_command(), run_terminal()
|
│ ├── marcus_brain.py # Orchestrator: init_brain(), process_command(), run_terminal()
|
||||||
@ -188,7 +189,8 @@ Marcus/
|
|||||||
│ goal_nav.py │ │ builtin_mic.py │
|
│ goal_nav.py │ │ builtin_mic.py │
|
||||||
│ patrol.py │ │ builtin_tts.py │
|
│ patrol.py │ │ builtin_tts.py │
|
||||||
│ marcus_odometry.py │ │ marcus_voice.py │
|
│ marcus_odometry.py │ │ marcus_voice.py │
|
||||||
│ marcus_yolo.py │ │ (Whisper + TtsMaker) │
|
│ marcus_yolo.py │ │ wake_detector.py │
|
||||||
|
│ │ │ (Whisper + TtsMaker) │
|
||||||
│ marcus_imgsearch.py │ └──────────┬──────────────┘
|
│ marcus_imgsearch.py │ └──────────┬──────────────┘
|
||||||
└──────────────┬───────────┘ │
|
└──────────────┬───────────┘ │
|
||||||
│ │
|
│ │
|
||||||
@ -487,10 +489,10 @@ Supports text-only search (no reference image) using hint description.
|
|||||||
|
|
||||||
### Voice/
|
### Voice/
|
||||||
|
|
||||||
Mic, TTS and wake-word pipeline. All three files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable).
|
Mic, TTS, energy-based wake detector, and faster-whisper STT pipeline. All files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable).
|
||||||
|
|
||||||
#### `builtin_mic.py` (~180 lines, new 2026-04-21)
|
#### `builtin_mic.py` (~180 lines)
|
||||||
Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer.
|
Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer. Identical pattern to `Project/Sanad/voice/audio_io.py::BuiltinMic`.
|
||||||
|
|
||||||
**Exports:**
|
**Exports:**
|
||||||
- `BuiltinMic(group, port, buf_max, read_timeout)` — init (idempotent)
|
- `BuiltinMic(group, port, buf_max, read_timeout)` — init (idempotent)
|
||||||
@ -499,20 +501,41 @@ Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. Joins the G1's on-boa
|
|||||||
- `read_seconds(s)` — convenience for "record `s` seconds"
|
- `read_seconds(s)` — convenience for "record `s` seconds"
|
||||||
- `flush()` — drop buffered audio (called while TTS plays, to avoid echo)
|
- `flush()` — drop buffered audio (called while TTS plays, to avoid echo)
|
||||||
|
|
||||||
#### `builtin_tts.py` (~70 lines, new 2026-04-21)
|
#### `builtin_tts.py` (~70 lines)
|
||||||
Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input (the G1 silently maps Arabic to Chinese, which confuses everyone).
|
Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input.
|
||||||
|
|
||||||
**Exports:**
|
**Exports:**
|
||||||
- `BuiltinTTS(audio_client, default_speaker_id=0)` — init
|
- `BuiltinTTS(audio_client, default_speaker_id=0)` — init
|
||||||
- `speak(text, speaker_id=None, block=True)` — synth+play on G1 body speaker
|
- `speak(text, speaker_id=None, block=True)` — synth+play on G1 body speaker
|
||||||
|
|
||||||
#### `marcus_voice.py` (~340 lines, rewired 2026-04-21)
|
#### `wake_detector.py` (~240 lines)
|
||||||
Always-listening English voice loop with a four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` listens for the wake word "Sanad" on 2-second chunks; Whisper `small` transcribes the full command. Mic input comes from `BuiltinMic`; responses go through `audio_api.speak()` → `BuiltinTTS`.
|
Pure-numpy energy-envelope state machine. Fires a wake event when it sees a short speech burst (0.35-1.5 s) sized to match a single spoken word like "Sanad", followed by a clear silence. No ML, no lexicon — just amplitude classification.
|
||||||
|
Adaptive noise-floor baseline: learns ambient RMS during idle, raises the effective threshold proportionally, so the detector works the same in a quiet room and a noisy lab. Captures the triggering burst audio (`get_last_burst()`) so callers can verify it was actually "Sanad" before acking. Exists because Vosk/Whisper both failed on the G1 far-field mic for short non-English proper nouns.
|
||||||
|
|
||||||
**Exports:**
|
**Exports:**
|
||||||
- `VoiceModule(audio_api, on_command=cb)` — init
|
- `WakeDetector(cfg)` with `WakeConfig(sample_rate, speech_threshold, min_word_duration_s, max_word_duration_s, post_silence_s, cooldown_s, chunk_ms, adaptive_window_n, adaptive_mult, diag_log_sec)`
|
||||||
- `start()` — spawn background thread
|
- `process(pcm_bytes) -> bool` — feed audio, returns True once per spoken "word"
|
||||||
- `stop()` — graceful teardown
|
- `reset()`, `get_last_burst() -> np.ndarray | None`
|
||||||
|
|
||||||
|
#### `marcus_voice.py` (~1000 lines)
|
||||||
|
Voice orchestrator. Reads from `BuiltinMic`, runs the `WakeDetector`, verifies the wake burst with a lightweight Whisper decode, records the command with hysteretic VAD (speech_entry / silence_exit thresholds, adaptive to measured ambient), trims leading silence before Whisper, transcribes with faster-whisper, fuzzy-matches against `command_vocab` to canonicalize near-misses ("Turn right up" → "turn right"), then dispatches to the brain callback.
|
||||||
|
|
||||||
|
Three operating modes selectable via `stt.mode`:
|
||||||
|
- `wake_and_command` (default): classic acoustic wake → TTS "Yes" → record → Whisper → brain
|
||||||
|
- `always_on`: no wake, transcribe every utterance, dispatch all
|
||||||
|
- `always_on_gated`: transcribe everything, only dispatch utterances containing "Sanad"
|
||||||
|
|
||||||
|
Wake verify rule: Whisper's decode must either contain a wake-word variant (`stt.wake_words`) OR start with `s/sh/z` — Whisper's consistent signature for mishearing "Sanad" as "Stop"/"Set"/"Sand". Pure silence / non-s speech is rejected silently.
|
||||||
|
|
||||||
|
**Module-level** (populated at `VoiceModule.__init__` from config):
|
||||||
|
- `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` — loaded from `config_Voice.json::stt.*`, single source of truth
|
||||||
|
- `_has_wake_word(text)`, `_strip_wake_word(text)` — iterative until stable, handles "Sanad. Sanad." → ""
|
||||||
|
- `_closest_command(text, cutoff)` — difflib fuzzy-match against `COMMAND_VOCAB`
|
||||||
|
|
||||||
|
**Exports:**
|
||||||
|
- `VoiceModule(audio_api, on_command=cb, on_wake=None)` — init
|
||||||
|
- `start()` / `stop()` — background thread lifecycle
|
||||||
|
- `is_running` property
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@ -79,16 +79,30 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
|
|||||||
|
|
||||||
## Voice
|
## Voice
|
||||||
|
|
||||||
- **Wake word:** "Sanad" (variants "sannad", "sanat", "sunnat" — see `config_Voice.json::stt.wake_words_en`)
|
- **Wake word:** "Sanad" (Whisper mishears it as "Stop", "Sand", "Set", "Send" — all accepted via the /s-/ phonetic rule; see `config_Voice.json::stt.wake_words` for the 33 fuzzy variants).
|
||||||
- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed.
|
- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed.
|
||||||
- **STT:** Whisper `tiny` (wake detection) + Whisper `small` (command transcription) — both run locally.
|
- **Wake detection:** custom energy-envelope state machine (pure numpy, no ML) — fires on any 0.35-1.5 s speech burst followed by silence. Adaptive to room ambient.
|
||||||
|
- **Wake verify:** lightweight Whisper decode on the triggering burst. Accepts if it contains a wake-word variant OR starts with `s`/`sh`/`z` (Whisper's consistent signature for "Sanad"). Rejects pure noise / non-s speech silently.
|
||||||
|
- **STT (command):** faster-whisper `base.en` int8 on CPU — loads ~1.5 s on first wake, cached after.
|
||||||
- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only.
|
- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only.
|
||||||
- **Barge-in:** say something while Marcus is speaking and the mic buffer flushes on the next command.
|
- **Barge-in:** the mic is muted during TTS playback, then flushed on return to listening.
|
||||||
|
|
||||||
Interaction flow: say "Sanad" → hear *"Listening"* → speak your command → see transcript on console → Marcus answers through the speaker.
|
Interaction flow: say "Sanad" → hear *"Yes"* → speak your command → see transcript on console → Marcus answers through the speaker.
|
||||||
|
|
||||||
|
Three voice modes selectable via `config_Voice.json::stt.mode`:
|
||||||
|
- `wake_and_command` (default) — wake word required before each command
|
||||||
|
- `always_on` — continuously transcribe + dispatch every utterance
|
||||||
|
- `always_on_gated` — always listen + log, dispatch only if utterance contains "Sanad"
|
||||||
|
|
||||||
To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only ~2 s faster.
|
To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only ~2 s faster.
|
||||||
|
|
||||||
|
**Tuning knobs** (when false wakes or rejected real wakes) — all in `config_Voice.json::stt`:
|
||||||
|
- Too many false wakes from coughs/claps → raise `speech_threshold` or `min_word_duration`
|
||||||
|
- Real "Sanad" being rejected → check the log line `wake REJECTED — %r` to see what Whisper heard; widen `wake_words` if needed
|
||||||
|
- Commands transcribed wrong → check `whisper: lp=%.2f nsp=%.2f text=%r` log line; lower `whisper_no_speech_threshold` or tighten `whisper_log_prob_threshold`
|
||||||
|
- "I didn't catch that" on silence → raise `min_transcription_length`
|
||||||
|
- Latency too high → set `wake_ack: "none"` (skip "Yes" TTS, save ~1.7 s/cycle)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Command Reference
|
## Command Reference
|
||||||
@ -230,7 +244,7 @@ Most values configurable in `Config/config_Network.json` and `config_Voice.json:
|
|||||||
| `llama runner process has terminated: %!w(<nil>)` | Ollama compute graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` |
|
| `llama runner process has terminated: %!w(<nil>)` | Ollama compute graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` |
|
||||||
| Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only |
|
| Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only |
|
||||||
| `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10–15 s on first Qwen load; subsequent commands are fast |
|
| `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10–15 s on first Qwen load; subsequent commands are fast |
|
||||||
| Wake word never fires | Whisper hearing something else | Check `logs/voice.log` — if it transcribes as "sunnat"/"sannat", add your variant to `config_Voice.json::stt.wake_words_en` |
|
| Wake word never fires | Energy burst below floor, or Whisper verify rejecting | Check `logs/voice.log` — if you see `wake REJECTED — 'X'`, add X's root variant to `config_Voice.json::stt.wake_words`. If `baseline=0` persists, your ambient exceeds the floor — raise `speech_threshold`. |
|
||||||
| Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" |
|
| Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" |
|
||||||
| `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If persists, `ping 192.168.123.120` |
|
| `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If persists, `ping 192.168.123.120` |
|
||||||
| Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up |
|
| Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up |
|
||||||
@ -243,7 +257,7 @@ Most values configurable in `Config/config_Network.json` and `config_Voice.json:
|
|||||||
|------|------|
|
|------|------|
|
||||||
| Brain code | `~/Marcus/Brain/` |
|
| Brain code | `~/Marcus/Brain/` |
|
||||||
| Server | `~/Marcus/Server/marcus_server.py` |
|
| Server | `~/Marcus/Server/marcus_server.py` |
|
||||||
| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,marcus_voice}.py` |
|
| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,wake_detector,marcus_voice}.py` |
|
||||||
| Config | `~/Marcus/Config/` |
|
| Config | `~/Marcus/Config/` |
|
||||||
| Prompts | `~/Marcus/Config/marcus_prompts.yaml` |
|
| Prompts | `~/Marcus/Config/marcus_prompts.yaml` |
|
||||||
| YOLO model | `~/Marcus/Models/yolov8m.pt` |
|
| YOLO model | `~/Marcus/Models/yolov8m.pt` |
|
||||||
@ -261,7 +275,7 @@ See `Doc/functions.md` for the full function inventory (AST-generated).
|
|||||||
## Language policy
|
## Language policy
|
||||||
|
|
||||||
**English only.** Arabic was removed from the codebase on 2026-04-21:
|
**English only.** Arabic was removed from the codebase on 2026-04-21:
|
||||||
- `Config/config_Voice.json::stt.wake_words_en` — only English variants (`sanad`, `sannad`, `sanat`, `sunnat`)
|
- `Config/config_Voice.json::stt.wake_words` — English fuzzy variants only (33 entries), excludes common English words that would false-trigger (`said`, `sand`, `sunday`, etc.)
|
||||||
- `Config/marcus_prompts.yaml` — no Arabic examples left in any of the 7 prompts
|
- `Config/marcus_prompts.yaml` — no Arabic examples left in any of the 7 prompts
|
||||||
- `API/audio_api.py::speak(text)` — rejects non-ASCII (the G1 TtsMaker silently maps Arabic to Chinese, which nobody wants)
|
- `API/audio_api.py::speak(text)` — rejects non-ASCII (the G1 TtsMaker silently maps Arabic to Chinese, which nobody wants)
|
||||||
- `Brain/marcus_brain.py` — greeting and talk-pattern regexes match English only
|
- `Brain/marcus_brain.py` — greeting and talk-pattern regexes match English only
|
||||||
|
|||||||
@ -384,3 +384,5 @@ Config file (`Config/config_Vision.json`):
|
|||||||
| 2026-04-21 | **Subprocess leak fix**: `AudioAPI._record_parec` now wraps `Popen` in try/finally with `terminate → wait(1.0) → kill` fallback; orphan `parec` processes can no longer survive Ctrl-C. Last-resort `proc.kill()` catches only `OSError` (not bare `except`). |
|
| 2026-04-21 | **Subprocess leak fix**: `AudioAPI._record_parec` now wraps `Popen` in try/finally with `terminate → wait(1.0) → kill` fallback; orphan `parec` processes can no longer survive Ctrl-C. Last-resort `proc.kill()` catches only `OSError` (not bare `except`). |
|
||||||
| 2026-04-21 | **Modelfile corrected**: `Models/Modelfile` now `FROM qwen2.5vl:3b` (was `:7b`) with a header explaining it's an optional build template — runtime uses `ollama pull qwen2.5vl:3b` directly. |
|
| 2026-04-21 | **Modelfile corrected**: `Models/Modelfile` now `FROM qwen2.5vl:3b` (was `:7b`) with a header explaining it's an optional build template — runtime uses `ollama pull qwen2.5vl:3b` directly. |
|
||||||
| 2026-04-21 | **Final verification**: 14-dimension smoke test green — no Arabic, no dead dirs, 0 orphan keys, every FileHandler rotates, no bare `except: pass`, no stale `Models_marcus` / `marcus_llava` refs, 25/25 modules import. |
|
| 2026-04-21 | **Final verification**: 14-dimension smoke test green — no Arabic, no dead dirs, 0 orphan keys, every FileHandler rotates, no bare `except: pass`, no stale `Models_marcus` / `marcus_llava` refs, 25/25 modules import. |
|
||||||
|
| 2026-04-24 | **Voice finalised on faster-whisper + custom energy wake**. Added `Voice/wake_detector.py` (pure-numpy energy state machine, adaptive noise floor, burst-audio capture for verify). Rewrote `Voice/marcus_voice.py` around it: three operating modes (`wake_and_command` / `always_on` / `always_on_gated`), hysteretic record VAD, pre-speech silence trim (300 ms pre-roll preserved), faster-whisper `base.en` int8 CPU decode, fuzzy-match canonicalisation against `command_vocab`, `GARBAGE_PATTERNS` + length filter for noise hallucinations, `/s-/` phonetic wake verify (accepts Whisper mishearings of "Sanad" like "Stop"/"Set"/"Sand"). Tried and reverted: Gemini Live WebSocket (Python 3.8 incompatibility + latency), Vosk grammar STT (English lexicon can't decode "Sanad"; big model cold-load too slow on Jetson). All voice tunables (33 wake_words, 68 command_vocab, 17 garbage_patterns, ~25 threshold/VAD/Whisper keys) live in `config_Voice.json::stt.*` — zero hardcoded strings in Voice/. |
|
||||||
|
| 2026-04-24 | **Command parser widened**: `Brain/command_parser.py` now has `_RE_SIMPLE_DIR` (`left`, `go back`, `move forward`, `step right`, etc.) and `_RE_STOP_SIMPLE` (`stop`, `halt`, `wait`, `pause`, `freeze`) regex fast-paths — these bare-direction / bare-stop commands now skip Qwen entirely (~50 ms vs ~5 s). Motion velocities and step duration pulled from `config_Navigation.json::{move_map, step_duration_sec}` via `API/zmq_api.py`; command_parser no longer contains hardcoded `0.3` / `2.0` magic numbers. |
|
||||||
|
|||||||
@ -49,13 +49,14 @@ Script only. Prepends `PROJECT_ROOT` to `sys.path`, then calls `Brain.marcus_bra
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## `Voice/` — mic + TTS + STT
|
## `Voice/` — mic + TTS + wake + STT
|
||||||
|
|
||||||
| File | Public API |
|
| File | Public API |
|
||||||
|---|---|
|
|---|---|
|
||||||
| `builtin_mic.py` | `_find_g1_local_ip()` + **class `BuiltinMic`** |
|
| `builtin_mic.py` | `_find_g1_local_ip()` + **class `BuiltinMic`** |
|
||||||
| `builtin_tts.py` | **class `BuiltinTTS`** |
|
| `builtin_tts.py` | **class `BuiltinTTS`** |
|
||||||
| `marcus_voice.py` | **class `State`** (IDLE/WAKE_HEARD/PROCESSING/SPEAKING), **class `VoiceModule`** |
|
| `wake_detector.py` | **dataclass `WakeConfig`** + **class `WakeDetector`** |
|
||||||
|
| `marcus_voice.py` | module-level `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` (populated from config), helpers `_has_wake_word`, `_strip_wake_word`, `_strip_wake_word_once`, `_closest_command`, **class `VoiceModule`** |
|
||||||
|
|
||||||
**`Voice.builtin_mic.BuiltinMic`** — G1 UDP multicast mic:
|
**`Voice.builtin_mic.BuiltinMic`** — G1 UDP multicast mic:
|
||||||
`__init__(group, port, buf_max, read_timeout)`, `start()`, `stop()`, `read_chunk(num_bytes)`, `read_seconds(seconds)`, `flush()`; internal `_recv_loop`.
|
`__init__(group, port, buf_max, read_timeout)`, `start()`, `stop()`, `read_chunk(num_bytes)`, `read_seconds(seconds)`, `flush()`; internal `_recv_loop`.
|
||||||
@ -63,8 +64,11 @@ Script only. Prepends `PROJECT_ROOT` to `sys.path`, then calls `Brain.marcus_bra
|
|||||||
**`Voice.builtin_tts.BuiltinTTS`** — wraps `AudioClient.TtsMaker`:
|
**`Voice.builtin_tts.BuiltinTTS`** — wraps `AudioClient.TtsMaker`:
|
||||||
`__init__(audio_client, default_speaker_id=0)`, `speak(text, speaker_id=None, block=True)`.
|
`__init__(audio_client, default_speaker_id=0)`, `speak(text, speaker_id=None, block=True)`.
|
||||||
|
|
||||||
**`Voice.marcus_voice.VoiceModule`** — Whisper wake + command STT:
|
**`Voice.wake_detector.WakeDetector`** — pure-numpy energy wake:
|
||||||
`__init__(audio_api, on_command)`, `start()`, `stop()`, props `state`, `is_running`. Internal state machine: `_do_idle`, `_do_wake_heard`, `_do_processing`; helpers `_load_whisper`, `_transcribe`, `_check_wake_word`, `_record_chunk`, `_record_until_silence`, `_voice_loop`.
|
`__init__(cfg: WakeConfig)`, `process(pcm_bytes) -> bool`, `reset()`, `get_last_burst() -> np.ndarray | None`. Internal: `_step(window)` state-machine per 50 ms analysis window; adaptive `_baseline_buf` rolling mean of idle-silence RMS; captures triggering burst audio for post-hoc Whisper verify.
|
||||||
|
|
||||||
|
**`Voice.marcus_voice.VoiceModule`** — voice orchestrator. Drives the wake detector, verifies each fire with a lightweight Whisper decode (wake-word substring OR /s-/ phonetic match), records commands with a hysteretic VAD, trims pre-speech silence, transcribes via faster-whisper, fuzzy-normalises near-misses to canonical commands, dispatches to brain.
|
||||||
|
`__init__(audio_api, on_command=None, on_wake=None)`, `start()`, `stop()`, `is_running` property. Internal: `_get_fw()` lazy faster-whisper loader, `_read_mic_raw` / `_read_mic_gained`, `_record_command()` with adaptive VAD + pre-silence trim, `_transcribe(audio)` Whisper decode + garbage filter, `_transcribe_command(audio)` thin wrapper, `_normalize_command(text)` fuzzy-match to `COMMAND_VOCAB`, `_handle_wake()` / `_voice_loop()` / `_voice_loop_wake()` / `_voice_loop_always_on(gated)`, `_save_unk_wav(audio)` for post-mortem debugging.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@ -51,22 +51,43 @@ G1 body mic (array)
|
|||||||
Voice/builtin_mic.py::BuiltinMic
|
Voice/builtin_mic.py::BuiltinMic
|
||||||
ring buffer (64 KB) + read_chunk(n)
|
ring buffer (64 KB) + read_chunk(n)
|
||||||
▼
|
▼
|
||||||
Voice/marcus_voice.py::VoiceModule (IDLE → WAKE_HEARD → PROCESSING → SPEAKING)
|
Voice/wake_detector.py::WakeDetector
|
||||||
├─ IDLE : 2-s chunks → Whisper tiny → wake-word match ("sanad"/"sannad"/…)
|
pure-numpy energy state machine (SILENCE ⇄ SPEAKING)
|
||||||
├─ WAKE_HEARD : audio_api.speak("Listening") → G1 body speaker
|
adaptive noise floor: eff_threshold = max(speech_threshold, baseline × 3)
|
||||||
├─ PROCESSING : record-until-silence → Whisper small → transcribed text
|
fires on 0.35-1.5 s bursts followed by 0.3 s silence → captures burst audio
|
||||||
└─ on_command(text, "en")
|
▼
|
||||||
|
Voice/marcus_voice.py::VoiceModule._handle_wake()
|
||||||
|
├─ 1. Whisper verify on the burst audio:
|
||||||
|
│ text = faster-whisper(burst)
|
||||||
|
│ accept if _has_wake_word(text) OR startswith(s/sh/z)
|
||||||
|
│ reject otherwise (cough, clap, hello, okay) → silent return
|
||||||
|
├─ 2. audio_api.speak("Yes") → G1 body speaker (~1.5 s)
|
||||||
|
├─ 3. post_tts_settle_sec wait + mic flush
|
||||||
|
├─ 4. _record_command() — hysteretic VAD
|
||||||
|
│ speech_entry_rms / silence_exit_rms (adapt from wake baseline)
|
||||||
|
│ trim leading silence (keep 300 ms pre-roll) → tight clip for Whisper
|
||||||
|
├─ 5. _transcribe(audio)
|
||||||
|
│ faster-whisper (base.en int8 CPU)
|
||||||
|
│ beam_size=8, temperature fallback [0, 0.2, 0.4], initial_prompt bias toward Sanad vocab
|
||||||
|
│ GARBAGE_PATTERNS + min_transcription_length reject noise hallucinations
|
||||||
|
├─ 6. _normalize_command(text)
|
||||||
|
│ difflib fuzzy-match vs stt.command_vocab
|
||||||
|
│ "Turn right up" → "turn right" (canonical form)
|
||||||
|
└─ 7. on_command(text, "en")
|
||||||
▼
|
▼
|
||||||
Brain/marcus_brain.py::process_command(text)
|
Brain/marcus_brain.py::process_command(text)
|
||||||
├─ regex fast-path → Brain/command_parser.py::try_local_command()
|
├─ regex fast-path → Brain/command_parser.py::try_local_command()
|
||||||
│ places · odometry walk/turn · patrol · session recall · goal_nav · auto on/off
|
│ places · odometry walk/turn · patrol · session recall · goal_nav
|
||||||
|
│ + SIMPLE_DIR ("go back", "right", "forward") · STOP_SIMPLE ("stop", "halt")
|
||||||
|
│ + NAT_GOAL_RE (naturalised goals like "the chair") · auto on/off
|
||||||
|
│ (~50 ms when matched — NO LLM call)
|
||||||
└─ else → _handle_llava(text)
|
└─ else → _handle_llava(text)
|
||||||
├─ get_frame() (10×50 ms poll, no 1 s stall)
|
├─ get_frame() (10×50 ms poll, no 1 s stall)
|
||||||
├─ API/llava_api.py::ask(text, img)
|
├─ API/llava_api.py::ask(text, img)
|
||||||
│ ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120)
|
│ ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120)
|
||||||
│ → parse_json() → {actions, arm, speak, abort}
|
│ → parse_json() → {actions, arm, speak, abort}
|
||||||
└─ Brain/executor.py::execute(d)
|
└─ Brain/executor.py::execute(d)
|
||||||
├─ actions → API/zmq_api.py::send_vel(vx, vy, vyaw) → Holosoma
|
├─ actions → MOVE_MAP[dir] → API/zmq_api.py::send_vel → Holosoma
|
||||||
├─ arm → API/arm_api.py (stub for now)
|
├─ arm → API/arm_api.py (stub for now)
|
||||||
└─ abort → gradual_stop()
|
└─ abort → gradual_stop()
|
||||||
▼
|
▼
|
||||||
@ -77,9 +98,17 @@ API/audio_api.py::speak(text, lang="en")
|
|||||||
├─ Voice/builtin_tts.py::BuiltinTTS.speak(text)
|
├─ Voice/builtin_tts.py::BuiltinTTS.speak(text)
|
||||||
│ client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only
|
│ client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only
|
||||||
│ time.sleep(len(text) * 0.08)
|
│ time.sleep(len(text) * 0.08)
|
||||||
└─ unmute mic → back to IDLE
|
└─ unmute mic → back to listening
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Config knobs** (all in `config_Voice.json::stt`):
|
||||||
|
- Wake: `speech_threshold` (floor), `min_word_duration`, `max_word_duration`, `post_silence`, `wake_cooldown`, `wake_adaptive_mult`, `wake_diag_log_sec`
|
||||||
|
- Verify: `wake_verify_enabled`
|
||||||
|
- Record: `speech_entry_rms`, `silence_exit_rms`, `silence_duration_sec`, `max_record_sec`, `min_record_sec`, `ambient_mult`, `ambient_cap_rms`
|
||||||
|
- Whisper: `whisper_model`, `whisper_compute_type`, `whisper_beam_size`, `whisper_no_speech_threshold`, `whisper_log_prob_threshold`, `whisper_initial_prompt`, `mic_gain`
|
||||||
|
- Vocab: `wake_words`, `command_vocab`, `garbage_patterns`, `command_vocab_cutoff`, `min_transcription_length`
|
||||||
|
- Mode: `mode` (`wake_and_command` | `always_on` | `always_on_gated`), `wake_ack` (`tts`|`none`)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Terminal / WebSocket command pipeline (same brain, skips voice)
|
## Terminal / WebSocket command pipeline (same brain, skips voice)
|
||||||
@ -169,7 +198,12 @@ Brain/command_parser.py — responds to "lidar status" queries
|
|||||||
| `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast |
|
| `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast |
|
||||||
| `mic_udp.read_timeout_sec` | config_Voice.json | `BuiltinMic.read_chunk` budget (default 0.04 s) |
|
| `mic_udp.read_timeout_sec` | config_Voice.json | `BuiltinMic.read_chunk` budget (default 0.04 s) |
|
||||||
| `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) |
|
| `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) |
|
||||||
| `stt.wake_words_en` | config_Voice.json | Whisper matcher (`sanad` + variants) |
|
| `stt.wake_words` | config_Voice.json | 33 fuzzy variants of "Sanad" for the wake-verify substring match |
|
||||||
|
| `stt.command_vocab` | config_Voice.json | 68 canonical command phrases for fuzzy-normalization (`"turn right up"` → `"turn right"`) |
|
||||||
|
| `stt.garbage_patterns` | config_Voice.json | 17 Whisper noise-hallucinations to reject (`"thanks for watching"`, `"okay"`, etc.) |
|
||||||
|
| `stt.speech_threshold` etc. | config_Voice.json | energy wake detector thresholds — see `Doc/controlling.md` "Voice" for the full tuning matrix |
|
||||||
|
| `stt.whisper_*` | config_Voice.json | faster-whisper model, compute type, beam size, confidence gates, bias prompt |
|
||||||
|
| `stt.mode` | config_Voice.json | `wake_and_command` (default) / `always_on` / `always_on_gated` |
|
||||||
| `timeout_ms`, `stale_threshold_s`, `reconnect_delay_s` | config_Camera.json | RealSense frame timeout, reconnect trigger, initial backoff |
|
| `timeout_ms`, `stale_threshold_s`, `reconnect_delay_s` | config_Camera.json | RealSense frame timeout, reconnect trigger, initial backoff |
|
||||||
| `default_max_steps`, `step_delay_s`, `rotate_speed`, `min_steps_warmup` | config_ImageSearch.json | image-guided search rotation cadence (wired into `Vision/marcus_imgsearch.py`) |
|
| `default_max_steps`, `step_delay_s`, `rotate_speed`, `min_steps_warmup` | config_ImageSearch.json | image-guided search rotation cadence (wired into `Vision/marcus_imgsearch.py`) |
|
||||||
| `default_walk_speed`, `dist_tolerance`, `angle_tolerance`, `safety_timeout_mult`, `dr_update_hz` | config_Odometry.json | precise motion control (wired into `Navigation/marcus_odometry.py`) |
|
| `default_walk_speed`, `dist_tolerance`, `angle_tolerance`, `safety_timeout_mult`, `dr_update_hz` | config_Odometry.json | precise motion control (wired into `Navigation/marcus_odometry.py`) |
|
||||||
@ -181,9 +215,14 @@ Brain/command_parser.py — responds to "lidar status" queries
|
|||||||
|
|
||||||
| Step | Typical | Notes |
|
| Step | Typical | Notes |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| Wake-word detect | 200–500 ms | Whisper tiny on 2 s chunk |
|
| Wake-word detect | <100 ms | pure-numpy energy detector, 50 ms analysis windows |
|
||||||
| Record until silence | 1–8 s | depends on user speech |
|
| Wake verify (first wake) | ~2000 ms | includes faster-whisper `base.en` cold load |
|
||||||
| Whisper small STT | 500–1500 ms | once per command |
|
| Wake verify (subsequent) | 300–700 ms | Whisper cached, decodes ~0.5-1.5 s burst |
|
||||||
|
| "Yes" TTS ack | ~1500 ms | G1 firmware `TtsMaker` minimum |
|
||||||
|
| Record until silence | 1–5 s | depends on user speech; `max_record_sec=5` cap |
|
||||||
|
| Pre-silence trim | <1 ms | numpy slice |
|
||||||
|
| faster-whisper STT | 500–1500 ms | `base.en` int8 on CPU, beam_size=5 |
|
||||||
|
| Fuzzy-match canonicalisation | <1 ms | difflib against 68 phrases |
|
||||||
| Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall |
|
| Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall |
|
||||||
| Ollama Qwen2.5-VL | 800–1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` |
|
| Ollama Qwen2.5-VL | 800–1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` |
|
||||||
| Executor + ZMQ send | <10 ms | fire-and-forget PUB |
|
| Executor + ZMQ send | <10 ms | fire-and-forget PUB |
|
||||||
|
|||||||
Binary file not shown.
@ -1,9 +0,0 @@
|
|||||||
US English model for mobile Vosk applications
|
|
||||||
|
|
||||||
Copyright 2020 Alpha Cephei Inc
|
|
||||||
|
|
||||||
Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean)
|
|
||||||
Speed: 0.11xRT (desktop)
|
|
||||||
Latency: 0.15s (right context)
|
|
||||||
|
|
||||||
|
|
||||||
Binary file not shown.
@ -1,7 +0,0 @@
|
|||||||
--sample-frequency=16000
|
|
||||||
--use-energy=false
|
|
||||||
--num-mel-bins=40
|
|
||||||
--num-ceps=40
|
|
||||||
--low-freq=20
|
|
||||||
--high-freq=7600
|
|
||||||
--allow-downsample=true
|
|
||||||
@ -1,10 +0,0 @@
|
|||||||
--min-active=200
|
|
||||||
--max-active=3000
|
|
||||||
--beam=10.0
|
|
||||||
--lattice-beam=2.0
|
|
||||||
--acoustic-scale=1.0
|
|
||||||
--frame-subsampling-factor=3
|
|
||||||
--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
|
|
||||||
--endpoint.rule2.min-trailing-silence=0.5
|
|
||||||
--endpoint.rule3.min-trailing-silence=0.75
|
|
||||||
--endpoint.rule4.min-trailing-silence=1.0
|
|
||||||
Binary file not shown.
Binary file not shown.
@ -1,17 +0,0 @@
|
|||||||
10015
|
|
||||||
10016
|
|
||||||
10017
|
|
||||||
10018
|
|
||||||
10019
|
|
||||||
10020
|
|
||||||
10021
|
|
||||||
10022
|
|
||||||
10023
|
|
||||||
10024
|
|
||||||
10025
|
|
||||||
10026
|
|
||||||
10027
|
|
||||||
10028
|
|
||||||
10029
|
|
||||||
10030
|
|
||||||
10031
|
|
||||||
@ -1,166 +0,0 @@
|
|||||||
1 nonword
|
|
||||||
2 begin
|
|
||||||
3 end
|
|
||||||
4 internal
|
|
||||||
5 singleton
|
|
||||||
6 nonword
|
|
||||||
7 begin
|
|
||||||
8 end
|
|
||||||
9 internal
|
|
||||||
10 singleton
|
|
||||||
11 begin
|
|
||||||
12 end
|
|
||||||
13 internal
|
|
||||||
14 singleton
|
|
||||||
15 begin
|
|
||||||
16 end
|
|
||||||
17 internal
|
|
||||||
18 singleton
|
|
||||||
19 begin
|
|
||||||
20 end
|
|
||||||
21 internal
|
|
||||||
22 singleton
|
|
||||||
23 begin
|
|
||||||
24 end
|
|
||||||
25 internal
|
|
||||||
26 singleton
|
|
||||||
27 begin
|
|
||||||
28 end
|
|
||||||
29 internal
|
|
||||||
30 singleton
|
|
||||||
31 begin
|
|
||||||
32 end
|
|
||||||
33 internal
|
|
||||||
34 singleton
|
|
||||||
35 begin
|
|
||||||
36 end
|
|
||||||
37 internal
|
|
||||||
38 singleton
|
|
||||||
39 begin
|
|
||||||
40 end
|
|
||||||
41 internal
|
|
||||||
42 singleton
|
|
||||||
43 begin
|
|
||||||
44 end
|
|
||||||
45 internal
|
|
||||||
46 singleton
|
|
||||||
47 begin
|
|
||||||
48 end
|
|
||||||
49 internal
|
|
||||||
50 singleton
|
|
||||||
51 begin
|
|
||||||
52 end
|
|
||||||
53 internal
|
|
||||||
54 singleton
|
|
||||||
55 begin
|
|
||||||
56 end
|
|
||||||
57 internal
|
|
||||||
58 singleton
|
|
||||||
59 begin
|
|
||||||
60 end
|
|
||||||
61 internal
|
|
||||||
62 singleton
|
|
||||||
63 begin
|
|
||||||
64 end
|
|
||||||
65 internal
|
|
||||||
66 singleton
|
|
||||||
67 begin
|
|
||||||
68 end
|
|
||||||
69 internal
|
|
||||||
70 singleton
|
|
||||||
71 begin
|
|
||||||
72 end
|
|
||||||
73 internal
|
|
||||||
74 singleton
|
|
||||||
75 begin
|
|
||||||
76 end
|
|
||||||
77 internal
|
|
||||||
78 singleton
|
|
||||||
79 begin
|
|
||||||
80 end
|
|
||||||
81 internal
|
|
||||||
82 singleton
|
|
||||||
83 begin
|
|
||||||
84 end
|
|
||||||
85 internal
|
|
||||||
86 singleton
|
|
||||||
87 begin
|
|
||||||
88 end
|
|
||||||
89 internal
|
|
||||||
90 singleton
|
|
||||||
91 begin
|
|
||||||
92 end
|
|
||||||
93 internal
|
|
||||||
94 singleton
|
|
||||||
95 begin
|
|
||||||
96 end
|
|
||||||
97 internal
|
|
||||||
98 singleton
|
|
||||||
99 begin
|
|
||||||
100 end
|
|
||||||
101 internal
|
|
||||||
102 singleton
|
|
||||||
103 begin
|
|
||||||
104 end
|
|
||||||
105 internal
|
|
||||||
106 singleton
|
|
||||||
107 begin
|
|
||||||
108 end
|
|
||||||
109 internal
|
|
||||||
110 singleton
|
|
||||||
111 begin
|
|
||||||
112 end
|
|
||||||
113 internal
|
|
||||||
114 singleton
|
|
||||||
115 begin
|
|
||||||
116 end
|
|
||||||
117 internal
|
|
||||||
118 singleton
|
|
||||||
119 begin
|
|
||||||
120 end
|
|
||||||
121 internal
|
|
||||||
122 singleton
|
|
||||||
123 begin
|
|
||||||
124 end
|
|
||||||
125 internal
|
|
||||||
126 singleton
|
|
||||||
127 begin
|
|
||||||
128 end
|
|
||||||
129 internal
|
|
||||||
130 singleton
|
|
||||||
131 begin
|
|
||||||
132 end
|
|
||||||
133 internal
|
|
||||||
134 singleton
|
|
||||||
135 begin
|
|
||||||
136 end
|
|
||||||
137 internal
|
|
||||||
138 singleton
|
|
||||||
139 begin
|
|
||||||
140 end
|
|
||||||
141 internal
|
|
||||||
142 singleton
|
|
||||||
143 begin
|
|
||||||
144 end
|
|
||||||
145 internal
|
|
||||||
146 singleton
|
|
||||||
147 begin
|
|
||||||
148 end
|
|
||||||
149 internal
|
|
||||||
150 singleton
|
|
||||||
151 begin
|
|
||||||
152 end
|
|
||||||
153 internal
|
|
||||||
154 singleton
|
|
||||||
155 begin
|
|
||||||
156 end
|
|
||||||
157 internal
|
|
||||||
158 singleton
|
|
||||||
159 begin
|
|
||||||
160 end
|
|
||||||
161 internal
|
|
||||||
162 singleton
|
|
||||||
163 begin
|
|
||||||
164 end
|
|
||||||
165 internal
|
|
||||||
166 singleton
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,3 +0,0 @@
|
|||||||
[
|
|
||||||
1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09
|
|
||||||
1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ]
|
|
||||||
@ -1 +0,0 @@
|
|||||||
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
|
|
||||||
@ -1,2 +0,0 @@
|
|||||||
--left-context=3
|
|
||||||
--right-context=3
|
|
||||||
14
README.md
14
README.md
@ -25,7 +25,7 @@ a Python brain.
|
|||||||
| **Brain** (reason, speak, decide) | Parse commands, reason about vision, pick actions | **Qwen2.5-VL 3B** via Ollama | Jetson GPU |
|
| **Brain** (reason, speak, decide) | Parse commands, reason about vision, pick actions | **Qwen2.5-VL 3B** via Ollama | Jetson GPU |
|
||||||
| **Eyes** (see) | Real-time object/person detection | **YOLOv8m** (CUDA, FP16, 320 px, ~22 FPS) | Jetson GPU |
|
| **Eyes** (see) | Real-time object/person detection | **YOLOv8m** (CUDA, FP16, 320 px, ~22 FPS) | Jetson GPU |
|
||||||
| **Eyes** (understand) | Open-ended scene understanding, reading, goal-verify | **Qwen2.5-VL** (same brain model) | Jetson GPU |
|
| **Eyes** (understand) | Open-ended scene understanding, reading, goal-verify | **Qwen2.5-VL** (same brain model) | Jetson GPU |
|
||||||
| **Ears** (hear) | Always-on wake-word + command transcription | **Whisper tiny** (wake) + **Whisper small** (STT) | Jetson CPU/GPU |
|
| **Ears** (hear) | Energy-based wake detector + command transcription | **Custom DSP wake** (numpy, no ML) + **faster-whisper base.en int8** (STT) | Jetson CPU |
|
||||||
| **Mouth** (speak) | On-robot TTS, no internet needed | **Unitree `TtsMaker`** (G1 firmware) | G1 body speaker |
|
| **Mouth** (speak) | On-robot TTS, no internet needed | **Unitree `TtsMaker`** (G1 firmware) | G1 body speaker |
|
||||||
| **Legs** (walk) | 29-DoF locomotion + balance | **Holosoma** RL policy (separate process, ONNX) | Jetson CPU |
|
| **Legs** (walk) | 29-DoF locomotion + balance | **Holosoma** RL policy (separate process, ONNX) | Jetson CPU |
|
||||||
| **Hands** (gesture) | Arm & hand actions | **GR00T N1.5** — pending; `API/arm_api.py` is a stub today | Jetson GPU (future) |
|
| **Hands** (gesture) | Arm & hand actions | **GR00T N1.5** — pending; `API/arm_api.py` is a stub today | Jetson GPU (future) |
|
||||||
@ -54,7 +54,7 @@ Camera ─┘ ▼ ├─► Legs (Holosoma
|
|||||||
|
|
||||||
Three input modalities, same command loop:
|
Three input modalities, same command loop:
|
||||||
|
|
||||||
- **Voice** — say "**Sanad, what do you see?**" → wake word fires, Whisper transcribes, brain answers through the G1 speaker.
|
- **Voice** — say "**Sanad**" → energy detector fires, Whisper verifies the /sa-/ phoneme signature, robot replies "Yes" → speak your command → faster-whisper transcribes → brain answers through the G1 speaker.
|
||||||
- **Text** — type the same command into `run_marcus.py`'s terminal.
|
- **Text** — type the same command into `run_marcus.py`'s terminal.
|
||||||
- **WebSocket (remote)** — `Client/marcus_cli.py` or `Client/marcus_client.py` (Tkinter GUI) send commands from a workstation.
|
- **WebSocket (remote)** — `Client/marcus_cli.py` or `Client/marcus_client.py` (Tkinter GUI) send commands from a workstation.
|
||||||
|
|
||||||
@ -84,7 +84,8 @@ There are two schools for combining them:
|
|||||||
| Vision — open-ended scene understanding | same VLM | learned |
|
| Vision — open-ended scene understanding | same VLM | learned |
|
||||||
| Legs / locomotion | **RL policy** (Holosoma, ONNX) | learned |
|
| Legs / locomotion | **RL policy** (Holosoma, ONNX) | learned |
|
||||||
| Arms / gestures | SDK action-ID lookup | **hand-coded** |
|
| Arms / gestures | SDK action-ID lookup | **hand-coded** |
|
||||||
| Wake-word + STT | Whisper | learned |
|
| Wake word | Custom energy-envelope DSP (numpy) | hand-coded |
|
||||||
|
| STT (command) | faster-whisper base.en | learned |
|
||||||
| TTS | Unitree `TtsMaker` (on-robot DSP) | firmware |
|
| TTS | Unitree `TtsMaker` (on-robot DSP) | firmware |
|
||||||
| Glue between layers | Python + ZMQ + JSON | hand-coded |
|
| Glue between layers | Python + ZMQ + JSON | hand-coded |
|
||||||
|
|
||||||
@ -143,13 +144,13 @@ Same hardware, different prompts + wake word.
|
|||||||
|
|
||||||
- **Prompts** rewrite: *"You are a museum guide. When a visitor asks about an exhibit, describe it in two sentences and invite them to ask follow-ups."*
|
- **Prompts** rewrite: *"You are a museum guide. When a visitor asks about an exhibit, describe it in two sentences and invite them to ask follow-ups."*
|
||||||
- **Places** memory pre-loaded with exhibit waypoints; `patrol: exhibit_A → exhibit_B → exit` follows a tour.
|
- **Places** memory pre-loaded with exhibit waypoints; `patrol: exhibit_A → exhibit_B → exit` follows a tour.
|
||||||
- Wake word changed in `config_Voice.json::stt.wake_words_en`.
|
- Wake word variants in `config_Voice.json::stt.wake_words` (fuzzy list, handles Whisper mishearings of "Sanad").
|
||||||
- Image search (`search/ photo_of_exhibit.jpg`) lets visitors hold up a printed map; the robot navigates to the matching location.
|
- Image search (`search/ photo_of_exhibit.jpg`) lets visitors hold up a printed map; the robot navigates to the matching location.
|
||||||
- YOLO classes trimmed to people-only if the venue doesn't need object safety.
|
- YOLO classes trimmed to people-only if the venue doesn't need object safety.
|
||||||
|
|
||||||
**What you change to switch use cases:**
|
**What you change to switch use cases:**
|
||||||
1. `Config/marcus_prompts.yaml` — persona + task descriptions
|
1. `Config/marcus_prompts.yaml` — persona + task descriptions
|
||||||
2. `Config/config_Voice.json::stt.wake_words_en` — the name people call the robot
|
2. `Config/config_Voice.json::stt.wake_words` — the name (+ fuzzy variants) people call the robot
|
||||||
3. `Config/config_Vision.json::tracked_classes` — relevant object set
|
3. `Config/config_Vision.json::tracked_classes` — relevant object set
|
||||||
4. `Config/config_Brain.json::subsystems.{lidar,voice,imgsearch,autonomous}` — enable what you need
|
4. `Config/config_Brain.json::subsystems.{lidar,voice,imgsearch,autonomous}` — enable what you need
|
||||||
5. Data under `Data/History/Places/places.json` — learned locations
|
5. Data under `Data/History/Places/places.json` — learned locations
|
||||||
@ -174,6 +175,7 @@ No code changes required for either deployment.
|
|||||||
Vision/ Navigation/ Voice/ Lidar/
|
Vision/ Navigation/ Voice/ Lidar/
|
||||||
YOLO, imgsearch goal_nav, builtin_mic, SLAM engine
|
YOLO, imgsearch goal_nav, builtin_mic, SLAM engine
|
||||||
patrol, odom builtin_tts, (subprocess)
|
patrol, odom builtin_tts, (subprocess)
|
||||||
|
wake_detector,
|
||||||
marcus_voice
|
marcus_voice
|
||||||
│
|
│
|
||||||
▼
|
▼
|
||||||
@ -253,7 +255,7 @@ Marcus/
|
|||||||
├── Brain/ orchestrator, parser, executor, memory
|
├── Brain/ orchestrator, parser, executor, memory
|
||||||
├── Vision/ YOLO + image-guided search
|
├── Vision/ YOLO + image-guided search
|
||||||
├── Navigation/ goal nav, patrol, odometry
|
├── Navigation/ goal nav, patrol, odometry
|
||||||
├── Voice/ built-in mic, built-in TTS, Whisper loop
|
├── Voice/ built-in mic, TTS, energy wake detector, faster-whisper STT
|
||||||
├── Autonomous/ exploration state machine
|
├── Autonomous/ exploration state machine
|
||||||
├── Lidar/ SLAM engine (subprocess)
|
├── Lidar/ SLAM engine (subprocess)
|
||||||
├── Server/ WebSocket interface
|
├── Server/ WebSocket interface
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -49,10 +49,14 @@ import numpy as np
|
|||||||
@dataclass
|
@dataclass
|
||||||
class WakeConfig:
|
class WakeConfig:
|
||||||
sample_rate: int = 16_000
|
sample_rate: int = 16_000
|
||||||
# RMS (int16 units) above which we consider a chunk to be speech.
|
# RMS (int16 units) FLOOR for "this chunk is speech". The effective
|
||||||
# G1 on-board mic at normal speaking distance has rms ≈ 500-1500
|
# threshold is max(speech_threshold, ambient_baseline * adaptive_mult)
|
||||||
# during speech and ≈ 40-100 in silence. 150 is a safe middle ground.
|
# so this is only a minimum guarantee — the detector adapts upward
|
||||||
speech_threshold: float = 150.0
|
# in noisy rooms but never below this floor.
|
||||||
|
# G1 far-field mic at normal speaking distance has rms ~ 80-400 for
|
||||||
|
# quiet speech, 400-1500 for clear speech. 80 catches quiet speech;
|
||||||
|
# raise to 120-150 if fan/typing noise triggers false wakes.
|
||||||
|
speech_threshold: float = 80.0
|
||||||
# How long a burst of speech must last to count as a "word".
|
# How long a burst of speech must last to count as a "word".
|
||||||
min_word_duration_s: float = 0.20
|
min_word_duration_s: float = 0.20
|
||||||
max_word_duration_s: float = 1.50
|
max_word_duration_s: float = 1.50
|
||||||
@ -63,6 +67,13 @@ class WakeConfig:
|
|||||||
cooldown_s: float = 1.50
|
cooldown_s: float = 1.50
|
||||||
# RMS window size — we analyze this many ms of audio per step.
|
# RMS window size — we analyze this many ms of audio per step.
|
||||||
chunk_ms: int = 50
|
chunk_ms: int = 50
|
||||||
|
# Adaptive: how many *recent silent* chunks to average for the noise
|
||||||
|
# floor, and the multiplier applied on top. effective_threshold =
|
||||||
|
# max(speech_threshold, baseline * adaptive_mult).
|
||||||
|
adaptive_window_n: int = 50 # ~2.5 s at 50 ms chunks
|
||||||
|
adaptive_mult: float = 3.0
|
||||||
|
# Periodic diagnostic log cadence (seconds). 0 disables.
|
||||||
|
diag_log_sec: float = 3.0
|
||||||
|
|
||||||
|
|
||||||
class WakeDetector:
|
class WakeDetector:
|
||||||
@ -88,6 +99,24 @@ class WakeDetector:
|
|||||||
# chunks don't align with our internal analysis window).
|
# chunks don't align with our internal analysis window).
|
||||||
self._carry = np.zeros(0, dtype=np.int16)
|
self._carry = np.zeros(0, dtype=np.int16)
|
||||||
|
|
||||||
|
# Audio of the most-recent wake-triggering burst. Saved when the
|
||||||
|
# detector fires so callers (marcus_voice) can run Whisper on it
|
||||||
|
# and verify the word was actually "Sanad" rather than a cough.
|
||||||
|
self._burst_samples: list = [] # accumulated during SPEAKING
|
||||||
|
self._last_burst_audio: Optional[np.ndarray] = None
|
||||||
|
|
||||||
|
# Adaptive noise floor (rolling mean of RMS during SILENCE chunks).
|
||||||
|
self._baseline_buf = [] # last N silent-window RMS values
|
||||||
|
self._baseline = 0.0 # current estimate
|
||||||
|
self._peak_since_diag = 0.0 # max rms since last diag log
|
||||||
|
self._last_diag = time.time()
|
||||||
|
# Logger is optional — if the host app set up logging, use it.
|
||||||
|
try:
|
||||||
|
import logging
|
||||||
|
self._log = logging.getLogger("wake_detector")
|
||||||
|
except Exception:
|
||||||
|
self._log = None
|
||||||
|
|
||||||
# ── public API ────────────────────────────────────────────────
|
# ── public API ────────────────────────────────────────────────
|
||||||
|
|
||||||
def process(self, pcm_bytes: bytes) -> bool:
|
def process(self, pcm_bytes: bytes) -> bool:
|
||||||
@ -122,31 +151,73 @@ class WakeDetector:
|
|||||||
self._state = self.STATE_SILENCE
|
self._state = self.STATE_SILENCE
|
||||||
self._silence_run = 0
|
self._silence_run = 0
|
||||||
self._carry = np.zeros(0, dtype=np.int16)
|
self._carry = np.zeros(0, dtype=np.int16)
|
||||||
|
self._burst_samples = []
|
||||||
|
|
||||||
|
def get_last_burst(self) -> Optional[np.ndarray]:
|
||||||
|
"""
|
||||||
|
Return the int16 PCM samples of the most-recent wake-triggering
|
||||||
|
burst, or None if no wake has fired yet. Used by marcus_voice to
|
||||||
|
verify the triggering word was actually 'Sanad' before proceeding.
|
||||||
|
"""
|
||||||
|
return self._last_burst_audio
|
||||||
|
|
||||||
# ── internal ──────────────────────────────────────────────────
|
# ── internal ──────────────────────────────────────────────────
|
||||||
|
|
||||||
def _step(self, window: np.ndarray) -> bool:
|
def _step(self, window: np.ndarray) -> bool:
|
||||||
rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2)))
|
rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2)))
|
||||||
is_speech = rms > self.cfg.speech_threshold
|
|
||||||
|
|
||||||
|
# Effective threshold = max(config floor, adaptive baseline * mult)
|
||||||
|
eff = self.cfg.speech_threshold
|
||||||
|
if self._baseline > 0:
|
||||||
|
eff = max(eff, self._baseline * self.cfg.adaptive_mult)
|
||||||
|
is_speech = rms > eff
|
||||||
|
|
||||||
|
# Track peak for diag. Log periodically so you can *see* what the
|
||||||
|
# detector is hearing — invaluable when "not hearing me" happens.
|
||||||
|
if rms > self._peak_since_diag:
|
||||||
|
self._peak_since_diag = rms
|
||||||
now = time.time()
|
now = time.time()
|
||||||
|
if self.cfg.diag_log_sec > 0 and (now - self._last_diag) >= self.cfg.diag_log_sec:
|
||||||
|
if self._log is not None:
|
||||||
|
self._log.info(
|
||||||
|
"wake: peak=%.0f baseline=%.0f eff_threshold=%.0f state=%s",
|
||||||
|
self._peak_since_diag, self._baseline, eff, self._state,
|
||||||
|
)
|
||||||
|
self._peak_since_diag = 0.0
|
||||||
|
self._last_diag = now
|
||||||
|
|
||||||
if now < self._cooldown_until:
|
if now < self._cooldown_until:
|
||||||
return False # silent during cooldown
|
return False # silent during cooldown
|
||||||
|
|
||||||
if self._state == self.STATE_SILENCE:
|
if self._state == self.STATE_SILENCE:
|
||||||
|
# Learn the noise floor ONLY in silence — so speech bursts
|
||||||
|
# don't pull the baseline up and lock us out of wake.
|
||||||
|
if not is_speech:
|
||||||
|
self._baseline_buf.append(rms)
|
||||||
|
if len(self._baseline_buf) > self.cfg.adaptive_window_n:
|
||||||
|
self._baseline_buf.pop(0)
|
||||||
|
if self._baseline_buf:
|
||||||
|
self._baseline = sum(self._baseline_buf) / len(self._baseline_buf)
|
||||||
if is_speech:
|
if is_speech:
|
||||||
self._state = self.STATE_SPEAKING
|
self._state = self.STATE_SPEAKING
|
||||||
self._speech_start = self._sample_cursor
|
self._speech_start = self._sample_cursor
|
||||||
self._silence_run = 0
|
self._silence_run = 0
|
||||||
|
# Begin capturing the burst audio for later Whisper verify.
|
||||||
|
self._burst_samples = [window.copy()]
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# STATE_SPEAKING
|
# STATE_SPEAKING
|
||||||
|
# Accumulate every window (speech OR silence inside the burst)
|
||||||
|
# so we capture the full word + trailing quiet.
|
||||||
|
self._burst_samples.append(window.copy())
|
||||||
|
|
||||||
if is_speech:
|
if is_speech:
|
||||||
self._silence_run = 0
|
self._silence_run = 0
|
||||||
# Abort if the burst is longer than a single word — user is
|
# Abort if the burst is longer than a single word — user is
|
||||||
# just talking, not addressing the robot.
|
# just talking, not addressing the robot.
|
||||||
if self._sample_cursor - self._speech_start > self._max_speech:
|
if self._sample_cursor - self._speech_start > self._max_speech:
|
||||||
self._state = self.STATE_SILENCE
|
self._state = self.STATE_SILENCE
|
||||||
|
self._burst_samples = []
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Silent window inside SPEAKING — accumulate.
|
# Silent window inside SPEAKING — accumulate.
|
||||||
@ -156,6 +227,12 @@ class WakeDetector:
|
|||||||
self._state = self.STATE_SILENCE
|
self._state = self.STATE_SILENCE
|
||||||
self._silence_run = 0
|
self._silence_run = 0
|
||||||
if self._min_speech <= speech_len <= self._max_speech:
|
if self._min_speech <= speech_len <= self._max_speech:
|
||||||
|
# Snapshot burst audio for the caller's Whisper verify.
|
||||||
|
self._last_burst_audio = (
|
||||||
|
np.concatenate(self._burst_samples)
|
||||||
|
if self._burst_samples else None
|
||||||
|
)
|
||||||
|
self._burst_samples = []
|
||||||
self._cooldown_until = now + self.cfg.cooldown_s
|
self._cooldown_until = now + self.cfg.cooldown_s
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user