Voice: finalise on faster-whisper + energy wake, remove Vosk

Full-day voice-stack refactor. Experiments run and reverted:
- Gemini Live HTTP microservice (Python 3.8 env incompat, latency)
- Vosk grammar STT (English lexicon can't decode 'Sanad'; big model
  cold-load too slow on Jetson CPU)

Kept architecture:
- Voice/wake_detector.py — pure-numpy energy state machine with
  adaptive baseline, burst-audio capture for post-hoc verify.
- Voice/marcus_voice.py — orchestrator with 3 modes
  (wake_and_command / always_on / always_on_gated), hysteretic VAD,
  pre-silence trim (300 ms pre-roll), DSP pipeline (DC remove,
  80 Hz HPF, 0.97 pre-emphasis, peak-normalize), faster-whisper
  base.en int8 with beam=8 + temperature fallback [0,0.2,0.4],
  fuzzy-match canonicalisation, GARBAGE_PATTERNS + length filter,
  /s-/ phonetic wake-verify, full-turn debug WAV recording.

Config-driven vocab (zero hardcoded strings in Python):
- stt.wake_words (33 variants of 'Sanad')
- stt.command_vocab (68 canonical phrases)
- stt.garbage_patterns (17 Whisper noise outputs)
- stt.min_transcription_length, stt.command_vocab_cutoff

Command parser widened (Brain/command_parser.py):
- _RE_SIMPLE_DIR — bare direction + verb+direction combos
  ('left', 'go back', 'move forward', 'step right', ...)
- _RE_STOP_SIMPLE — bare stop/halt/wait/pause/freeze/hold
- All motion constants sourced from config_Navigation.json
  (move_map + step_duration_sec) via API/zmq_api.py; no more
  hardcoded 0.3 / 2.0 magic numbers.

API/audio_api.py — _play_pcm now uses AudioClient.PlayStream with
automatic resampling to 16 kHz (matches Sanad's proven pattern).

Removed:
- Voice/vosk_stt.py (and all Vosk references in marcus_voice.py)
- Models/vosk-model-small-en-us-0.15/ (40 MB model + zip)
- All Vosk keys from Config/config_Voice.json

Documentation synced across README, Doc/architecture.md,
Doc/pipeline.md, Doc/functions.md, Doc/controlling.md,
Doc/MARCUS_API.md, and the Doc/environment.md changelog.

Known limitation: faster-whisper base.en on Jetson CPU + G1
far-field mic yields ~50% command-transcription accuracy due
to model capacity and mic reverberation. Wake + ack + recording
+ trim + Whisper + fuzzy + brain + motion all verified working
end-to-end. Future improvement path (unused): close-talking USB
mic via pactl_parec, or Gemini Live via HTTP microservice.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
kassam 2026-04-24 14:32:28 +04:00
parent f45e12fae5
commit 5d839d4f4e
30 changed files with 1533 additions and 418 deletions

View File

@ -211,56 +211,58 @@ class AudioAPI:
# ─── G1 SPEAKER PLAYBACK (raw PCM, kept for future backends) ─────────
def _play_pcm(self, audio_16k: np.ndarray) -> float:
"""Play 16kHz mono int16 on G1 speaker. Returns duration."""
def _play_pcm(self, audio: np.ndarray, rate: int = None) -> float:
"""
Play mono int16 PCM on the G1 speaker.
`rate` is the sample rate of the incoming `audio`; we always
resample to self._target_rate (16 kHz) before sending because the
G1 speaker hardware only honors that rate — if you hand it 24 kHz
PCM, it plays ~1.5x too fast. This matches the Sanad pattern.
Uses AudioClient.PlayStream (the high-level API) with a fresh
stream_id + STOP_PLAY bracket on either side so a prior stream
can't blend into this one.
"""
if not self._sdk_available:
log.warning("SDK not available, cannot play audio")
return 0.0
from unitree_sdk2py.g1.audio.g1_audio_api import (
ROBOT_API_ID_AUDIO_START_PLAY,
ROBOT_API_ID_AUDIO_STOP_PLAY,
)
src_rate = int(rate) if rate else self._target_rate
audio = self._resample(audio, src_rate) # → self._target_rate
if audio.size == 0:
return 0.0
from unitree_sdk2py.g1.audio.g1_audio_api import ROBOT_API_ID_AUDIO_STOP_PLAY
app_name = self._spk["app_name"]
# Stop previous stream
# Stop any prior stream before opening a new one.
self._client._Call(
ROBOT_API_ID_AUDIO_STOP_PLAY,
json.dumps({"app_name": app_name}),
)
time.sleep(0.3)
time.sleep(0.15)
# Build params — unique stream_id every call
pcm = audio_16k.tobytes()
sid = f"s_{int(time.time() * 1000)}"
param = json.dumps({
"app_name": app_name,
"stream_id": sid,
"sample_rate": self._target_rate,
"channels": 1,
"bits_per_sample": 16,
})
self._client.PlayStream(app_name, sid, audio.tobytes())
# Single call — full buffer
self._client._CallRequestWithParamAndBin(
ROBOT_API_ID_AUDIO_START_PLAY, param, list(pcm)
)
duration = len(audio_16k) / self._target_rate
time.sleep(duration + 0.5)
duration = len(audio) / self._target_rate
# Margin covers DDS buffer drain before STOP cuts playback short.
time.sleep(duration + 0.3)
self._client._Call(
ROBOT_API_ID_AUDIO_STOP_PLAY,
json.dumps({"app_name": app_name}),
)
log.info("Played: %.1fs", duration)
log.info("Played: %.1fs (src=%d Hz → hw=%d Hz)",
duration, src_rate, self._target_rate)
return duration
def play_pcm(self, audio_16k: np.ndarray) -> float:
def play_pcm(self, audio: np.ndarray, rate: int = None) -> float:
"""Public wrapper for playing PCM audio."""
return self._play_pcm(audio_16k)
return self._play_pcm(audio, rate=rate)
# ─── MIC RECORDING ───────────────────────────────────

View File

@ -80,6 +80,11 @@ def send_cmd(cmd: str):
_ensure_sock().send_string(json.dumps({"cmd": cmd}))
# Load MOVE_MAP from navigation config (pure data, safe at import time)
# Load navigation constants from config (pure data, safe at import time).
# MOVE_MAP[direction] = (vx, vy, vyaw). STEP_DURATION_SEC is how long one
# "step" of a bare directional command lasts (2 s at default velocities
# ≈ 60 cm forward or 34° turn). Both live in config_Navigation.json so
# motion can be retuned without editing Python.
_nav = load_config("Navigation")
MOVE_MAP = {k: tuple(v) for k, v in _nav["move_map"].items()}
STEP_DURATION_SEC = float(_nav.get("step_duration_sec", 2.0))

View File

@ -4,7 +4,7 @@ Handles place memory, odometry, session recall, help, examples
"""
import re
import time
from API.zmq_api import send_vel, gradual_stop
from API.zmq_api import send_vel, gradual_stop, MOVE_MAP, STEP_DURATION_SEC
from API.memory_api import mem, place_save, place_goto, places_list_str
from API.odometry_api import odom, ODOM_AVAILABLE
from API.camera_api import get_frame
@ -36,6 +36,30 @@ _RE_WALK_STEP = re.compile(
r"^(?:walk|go|move|step)(?:\s+(forward|back(?:ward)?))?\s+(\d+)\s*steps?$", re.I)
_RE_TURN_STEP = re.compile(
r"^turn\s+(left|right)(?:\s+(\d+)\s*steps?)?$", re.I)
# Simple one-shot motion — one word or verb+direction, no counts/units.
# All default to a ~2 s motion at the normal velocity. Kept local so the
# user doesn't eat a 5 s Qwen round-trip for a trivial "go back".
#
# Matches:
# "left" / "right" / "forward" / "back" / "backward"
# "go back" / "step back" / "move back" / "walk back" / "run back"
# "go forward" / "step forward" / "move forward" / "walk forward"
# "go left" / "move right" / "step left" / etc.
# "head forward" / "head back"
# Does NOT match multi-word phrases like "walk to the chair" — those
# still fall through to Qwen where they belong.
_RE_SIMPLE_DIR = re.compile(
r"^(?:(?:walk|go|move|step|run|head)\s+)?"
r"(forward|back(?:ward)?|left|right)$",
re.I,
)
# Bare stop / pause words — no need to ask Qwen what "stop" means.
_RE_STOP_SIMPLE = re.compile(
r"^(?:stop|halt|wait|pause|stay|freeze|hold|stand\s+still|don'?t\s+move)$",
re.I,
)
_RE_PATROL_RT = re.compile(
r"^patrol[/:]\s*(.+)$", re.I)
_RE_LAST_CMD = re.compile(
@ -115,9 +139,10 @@ def try_local_command(cmd: str) -> bool:
if odom:
odom.walk_distance(meters)
else:
vx, _, _ = MOVE_MAP["forward"]
t0 = time.time()
while time.time() - t0 < meters / 0.3:
send_vel(vx=0.3)
while time.time() - t0 < meters / abs(vx):
send_vel(vx=vx)
time.sleep(0.05)
gradual_stop()
return True
@ -128,9 +153,10 @@ def try_local_command(cmd: str) -> bool:
if odom:
odom.walk_distance(meters, direction="backward")
else:
vx, _, _ = MOVE_MAP["backward"]
t0 = time.time()
while time.time() - t0 < meters / 0.2:
send_vel(vx=-0.2)
while time.time() - t0 < meters / abs(vx):
send_vel(vx=vx)
time.sleep(0.05)
gradual_stop()
return True
@ -144,9 +170,13 @@ def try_local_command(cmd: str) -> bool:
if odom:
odom.turn_degrees(degrees)
else:
# vyaw magnitude comes from MOVE_MAP["left"]; duration is
# abs(degrees)/(vyaw_deg_per_sec). vyaw in config is rad/s.
_, _, vyaw_mag = MOVE_MAP["left"]
vyaw_deg_per_sec = abs(vyaw_mag) * 180.0 / 3.14159265
vyaw = vyaw_mag if degrees > 0 else -vyaw_mag
duration = abs(degrees) / vyaw_deg_per_sec
t0 = time.time()
vyaw = 0.3 if degrees > 0 else -0.3
duration = abs(degrees) / 17.2
while time.time() - t0 < duration:
send_vel(vyaw=vyaw)
time.sleep(0.05)
@ -156,9 +186,11 @@ def try_local_command(cmd: str) -> bool:
m = _RE_WALK_STEP.match(cmd)
if m:
direction = (m.group(1) or "forward").lower()
if direction.startswith("back"):
direction = "backward"
steps = int(m.group(2))
vx = -0.2 if direction.startswith("back") else 0.3
duration = 2.0 * steps
vx, _, _ = MOVE_MAP[direction]
duration = STEP_DURATION_SEC * steps
t0 = time.time()
while time.time() - t0 < duration:
send_vel(vx=vx)
@ -170,8 +202,8 @@ def try_local_command(cmd: str) -> bool:
if m:
direction = m.group(1).lower()
steps = int(m.group(2)) if m.group(2) else 1
vyaw = 0.3 if direction == "left" else -0.3
duration = 2.0 * steps
_, _, vyaw = MOVE_MAP[direction]
duration = STEP_DURATION_SEC * steps
t0 = time.time()
while time.time() - t0 < duration:
send_vel(vyaw=vyaw)
@ -179,6 +211,31 @@ def try_local_command(cmd: str) -> bool:
gradual_stop()
return True
# ── BARE / SIMPLE DIRECTIONAL COMMANDS ───────────────────────────────
# "left", "right", "forward", "back", "go back", "move forward",
# "step right", "walk back", "head forward" — any one-word direction
# or verb+direction with no explicit count. Duration and velocities
# come entirely from config_Navigation.json (MOVE_MAP +
# step_duration_sec) — no magic numbers here.
m = _RE_SIMPLE_DIR.match(cmd)
if m:
direction = m.group(1).lower()
if direction.startswith("back"):
direction = "backward"
vx, vy, vyaw = MOVE_MAP[direction]
t0 = time.time()
while time.time() - t0 < STEP_DURATION_SEC:
send_vel(vx=vx, vy=vy, vyaw=vyaw)
time.sleep(0.05)
gradual_stop()
return True
# ── BARE STOP / PAUSE ─────────────────────────────────────────────────
m = _RE_STOP_SIMPLE.match(cmd)
if m:
gradual_stop()
return True
# ── NAMED PATROL ROUTE ───────────────────────────────────────────────
m = _RE_PATROL_RT.match(cmd)
if m:
@ -234,16 +291,10 @@ def try_local_command(cmd: str) -> bool:
for phrase, (reverse_dir, _) in move_words.items():
if phrase in cl:
print(f" Undoing: '{c}' → reversing with '{reverse_dir}'")
dur, t0 = 2.0, time.time()
if reverse_dir in ("left", "right"):
vyaw = 0.3 if reverse_dir == "left" else -0.3
while time.time() - t0 < dur:
send_vel(vyaw=vyaw)
time.sleep(0.05)
else:
vx = 0.3 if reverse_dir == "forward" else -0.2
while time.time() - t0 < dur:
send_vel(vx=vx)
vx, vy, vyaw = MOVE_MAP[reverse_dir]
t0 = time.time()
while time.time() - t0 < STEP_DURATION_SEC:
send_vel(vx=vx, vy=vy, vyaw=vyaw)
time.sleep(0.05)
gradual_stop()
return True

View File

@ -3,6 +3,8 @@
"min_steps_before_check": 3,
"scan_interval_s": 0.4,
"rotation_speed": 0.3,
"_step_duration_comment": "Duration of one 'step' for bare directional commands ('go back', 'turn right', etc.). 2.0s at move_map velocities ≈ 60 cm forward, 40 cm back, 34° turn. Change here and every regex fast-path in command_parser.py uses the new value.",
"step_duration_sec": 2.0,
"move_map": {
"forward": [0.3, 0.0, 0.0],
"backward": [-0.2, 0.0, 0.0],

View File

@ -5,14 +5,119 @@
"target_sample_rate": 16000
},
"stt": {
"backend": "custom_acoustic",
"_comment": "Pure-DSP wake detector in Voice/wake_detector.py. No ML.",
"speech_threshold": 150.0,
"min_word_duration": 0.20,
"max_word_duration": 1.50,
"post_silence": 0.30,
"wake_cooldown": 1.50,
"wake_chunk_ms": 50
"backend": "faster_whisper",
"_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.",
"_mode_comment": "Three modes: 'wake_and_command' = instant acoustic wake detector (no ML) hears 'Sanad', THEN records a ~2s command, transcribes once — fastest, most reliable on G1 mic. 'always_on' = continuous VAD → Whisper every utterance, dispatch all (chatty, LLM gets every noise). 'always_on_gated' = continuous transcribe, dispatch only utterances containing 'Sanad' (Sanad-style but Whisper hallucinates commands from TTS echo on G1 mic, creating feedback loops — keep as opt-in, not default).",
"mode": "wake_and_command",
"_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.",
"always_on_speech_entry_rms": 150.0,
"always_on_silence_exit_rms": 70.0,
"always_on_silence_duration_sec": 0.8,
"always_on_min_utterance_sec": 0.3,
"always_on_max_utterance_sec": 12.0,
"always_on_idle_log_sec": 5.0,
"always_on_ambient_mult": 1.4,
"always_on_ambient_window_chunks": 100,
"whisper_model": "base.en",
"whisper_device": "cpu",
"whisper_compute_type": "int8",
"_whisper_tuning_comment": "base.en is the only model that decodes fast enough on Jetson Orin NX CPU. TESTED: small.en takes 10-12s per 1s burst (unusable); base.en runs ~2-3s per burst. tiny.en is even faster (~1s) but noticeably worse accuracy. If accuracy is poor on base.en (garbled transcriptions), the fix is hardware — switch to a close-talking USB mic (Hollyland) via mic.backend:pactl_parec. small.en cached in ~/.cache/huggingface/hub/ if you want to experiment again — try it on an x86 dev machine to see the accuracy gain before blaming Jetson.",
"mic_gain": 1.0,
"whisper_beam_size": 8,
"whisper_no_speech_threshold": 0.85,
"whisper_log_prob_threshold": -1.8,
"whisper_compression_ratio_threshold": 3.0,
"whisper_temperature_fallback": [0.0, 0.2, 0.4],
"_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.",
"_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.",
"_initial_prompt_comment": "Currently set to a single short cue ('Robot voice command.'). CAUTION: any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). An empty prompt gives a clean, unbiased decode that is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to \"\" if the cue text starts appearing as phantom transcriptions.",
"whisper_initial_prompt": "Robot voice command.",
"_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.",
"wake_words": [
"sanad", "sannad", "sennad", "sunnad", "sinnad", "sonnad",
"sanat", "sunnat", "sonnat", "sinnat", "sennat",
"sanid", "sanud", "saned", "sanod", "sanaad",
"senad", "sinad", "sonad", "sunad",
"sanah", "sanath", "sanadh", "sonadh",
"samad", "somad", "sumad",
"thanad", "zanad",
"sa nad", "san ad", "san odd", "san add"
],
"_wake_words_exclude_comment": "DELIBERATELY EXCLUDED from wake_words: 'said', 'sent', 'sand', 'sandy', 'sunday', 'signed', 'synod', 'sonata', 'sonnet', 'senate', 'sane', 'saint', 'sana'. These collide with common English and would false-trigger the gate.",
"command_vocab": [
"what do you see", "what can you see", "look around",
"come to me", "come here", "come back", "come closer",
"approach", "get closer", "come",
"go home", "go back", "go forward", "go backward",
"go left", "go right", "go",
"sit down", "stand up", "sit", "stand",
"raise arm", "lower arm", "wave hello", "wave", "point",
"turn left", "turn right", "turn around",
"move forward", "move backward", "move back",
"move left", "move right",
"walk forward", "walk backward", "walk back",
"step forward", "step back", "step left", "step right",
"forward", "backward", "back", "left", "right",
"patrol", "stop", "halt", "wait", "pause", "freeze", "hold",
"hello", "hi", "hey", "help",
"who are you", "where are you", "where am i", "what is your name",
"remember this", "forget", "do it again", "repeat", "undo",
"follow me", "stay here"
],
"command_vocab_cutoff": 0.72,
"_garbage_comment": "Whisper's known 'no phonetic content' outputs on low-SNR audio. YouTube outros, filler words, single-letter hallucinations. Any transcription matching one of these (or shorter than min_transcription_length) is rejected before fuzzy-match — treated as silence.",
"garbage_patterns": [
"thanks for watching", "thank you for watching",
"thank you", "thanks",
"bye", "goodbye",
".", "you", "yeah",
"okay", "ok",
"um", "uh", "hmm", "mm",
"i", "a"
],
"min_transcription_length": 3,
"_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin.",
"speech_threshold": 400.0,
"min_word_duration": 0.25,
"max_word_duration": 2.50,
"post_silence": 0.20,
"wake_cooldown": 1.00,
"wake_chunk_ms": 50,
"wake_adaptive_window_n": 50,
"wake_adaptive_mult": 3.0,
"wake_diag_log_sec": 3.0,
"wake_ack": "tts",
"_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).",
"_wake_verify_comment": "DISABLED for speed. When enabled, runs a ~3s Whisper decode on each wake burst and rejects non-/sa-/ speech — good for filtering coughs/claps, but adds 3s latency per wake. With it off, every acoustic wake fires — faster response, more false wakes from loud noises (but those drop silently at the 'no speech' recording stage, so user impact is near-zero). Set true again if background noise is triggering too many false wakes.",
"wake_verify_enabled": false,
"_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.",
"speech_entry_rms": 400.0,
"silence_exit_rms": 200.0,
"_vad_tuning_comment": "silence_duration_sec = how long of quiet ends an utterance. 0.6 cuts short commands fast (good UX) but may clip a thinking pause. ambient_mult scales the measured ambient into the effective entry threshold: effective_entry = max(speech_entry_rms, ambient * ambient_mult * 1.8). Smaller = more eager, catches quieter speech. Same adaptive-baseline mechanism as the wake detector (which uses its own wake_adaptive_mult).",
"silence_duration_sec": 0.6,
"max_record_sec": 5.0,
"min_record_sec": 0.4,
"ambient_probe_sec": 0.2,
"ambient_mult": 1.5,
"ambient_cap_rms": 200.0,
"_recording_comment": "Debug recording — save every command turn's audio to Data/Voice/Recordings/ as WAV. Filename includes epoch timestamp + transcription slug so you can replay what Whisper got and compare to what it heard. Rotates to keep most recent N files. Filename prefixes: 'cmd_*' = successful transcription, 'unk_*' = empty/rejected.",
"recording_enabled": true,
"recording_keep_count": 50,
"command_cooldown_sec": 1.5,
"post_tts_settle_sec": 0.4,
"_post_tts_settle_comment": "Time the mic ignores input AFTER the robot finishes speaking. Too short → TTS echo becomes false utterance. Too long → user speaks during the dead window and first syllables are clipped. 0.4s matches the G1 speaker decay at mic_gain=1.0; raise if you bump mic_gain above 1.5, lower if users report 'it cut off my first word'."
},
"mic": {
"backend": "builtin_udp",

View File

@ -25,6 +25,15 @@ main_prompt: |
- speak: actually describe what you are doing OR what the camera shows right now. Do NOT copy example text. First person. English.
- abort: null normally; "obstacle detected" / "unsafe command" / "cannot comply" with actions=[] when unsafe.
- CRITICAL — IF THE COMMAND IS UNCLEAR OR NOT AN ACTION:
If the input text is a single unclear word (like "I", "alright", "ok", "um"),
a random phrase ("I have a lot of beauty", "turn turn turn"), noise, a
greeting ("hello", "hi"), or anything that isn't clearly a movement /
arm / vision / memory instruction — DO NOT INVENT a command. Instead
reply with:
{{"actions":[],"arm":null,"speak":"Sorry, I didn't understand that — please repeat","abort":"command not understood"}}
Better to ask again than to guess and perform the wrong action.
Examples (learn the STRUCTURE, don't reuse the speak text):
"turn right" → {{"actions":[{{"move":"right","duration":2.0}}],"arm":null,"speak":"Turning right","abort":null}}
"walk 2 steps" → {{"actions":[{{"move":"forward","duration":2.0}}],"arm":null,"speak":"Walking forward","abort":null}}

View File

@ -29,7 +29,7 @@
| Image-search step delay | `Vision/marcus_imgsearch.py` | `STEP_DELAY` 0.4 s → 0.15 s. |
| Built-in G1 microphone | `Voice/builtin_mic.py` (new), `API/audio_api.py`, `Config/config_Voice.json` | Mic now reads from UDP multicast `239.168.123.161:5555` (G1 on-board array mic) instead of the Hollyland USB. Config key `mic.backend` defaults to `"builtin_udp"`; set to `"pactl_parec"` to fall back to the old path. |
| Built-in G1 TTS | `Voice/builtin_tts.py` (new), `API/audio_api.py` | `AudioAPI.speak(text)` now calls `client.TtsMaker(text, speaker_id)` directly. No MP3/WAV plumbing, no internet, no edge-tts/Piper. English only — `speak()` refuses non-ASCII to avoid the G1's silent Arabic→Chinese fallback. |
| Gemini voice deleted | `Voice/marcus_gemini_voice.py` removed | `_init_voice()` now spawns `Voice.marcus_voice.VoiceModule` (Whisper wake + command STT). No more WebSocket, no more asyncio event loop, no API key. |
| Voice stack finalised | `Voice/marcus_voice.py`, `Voice/wake_detector.py` | Custom energy wake detector (pure numpy) + Whisper verify + faster-whisper command STT + fuzzy-match to canonical commands. Vosk experiment reverted; Gemini Live reverted. Single local STT engine. |
| Subsystem flags | `Config/config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` | `init_brain()` skips any subsystem with `false`. Defaults: lidar+voice+autonomous ON, imgsearch OFF. |
| Robot persona → Sanad | Multiple | Wake words `["sanad","sannad","sanat","sunnat"]`; all prompts say "You are Sanad"; banner reads `SANAD AI BRAIN — READY`; hardcoded self-intro says "I am Sanad". Project/file/module names unchanged. |
| Logger rename | `Core/log_backend.py` (was `Core/Logger.py`) | Case-only collision with `Core/logger.py` removed — repo now clones cleanly on macOS/Windows. Public API unchanged: `from Core.logger import log`. |
@ -766,9 +766,9 @@ SAFETY:
---
## 15. Voice API (mic + TTS + STT)
## 15. Voice API (mic + TTS + wake + STT)
New pipeline as of 2026-04-21. Replaces the Gemini live WebSocket + edge-tts/Piper stack.
Current pipeline: G1 mic → custom energy wake detector → Whisper verify → TtsMaker "Yes" → record → faster-whisper transcribe → fuzzy-match canonical command → brain. Replaces all prior experiments (Gemini Live WebSocket, Vosk grammar, edge-tts / Piper).
### Mic — `Voice.builtin_mic.BuiltinMic`
@ -800,9 +800,36 @@ tts.speak("Hello, I am Sanad", block=True) # synth + play on G1 body speaker
Used by `AudioAPI.speak(text)` internally; application code should call `audio_api.speak(...)` rather than BuiltinTTS directly.
### Wake + command loop — `Voice.marcus_voice.VoiceModule`
### Wake detection — `Voice.wake_detector.WakeDetector`
Four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` detects the wake word, `small` transcribes commands.
Pure-numpy energy state machine with adaptive noise floor. Classifies any 0.35-1.5 s speech burst as a candidate wake, captures the audio for post-hoc verification.
```python
from Voice.wake_detector import WakeDetector, WakeConfig
cfg = WakeConfig(
sample_rate=16_000,
speech_threshold=400.0, # min RMS floor — above noise
min_word_duration_s=0.35, # filter out coughs (<0.35s)
max_word_duration_s=1.50, # filter out sentences
post_silence_s=0.30, # how long silence marks word end
cooldown_s=1.50, # min gap between fires
chunk_ms=50, # RMS analysis window
adaptive_window_n=50, # rolling mean of idle RMS
adaptive_mult=3.0, # effective = max(floor, baseline×mult)
)
det = WakeDetector(cfg)
while True:
pcm = mic.read_chunk(1024)
if det.process(pcm):
burst = det.get_last_burst() # audio that triggered wake
break
```
Config under `config_Voice.json::stt.{speech_threshold, min_word_duration, …}`.
### Voice orchestrator — `Voice.marcus_voice.VoiceModule`
Drives the full pipeline: wake detector → Whisper verify → record → transcribe → fuzzy-match → dispatch. Three operating modes (`wake_and_command`, `always_on`, `always_on_gated`) selectable via `stt.mode`.
```python
from API.audio_api import AudioAPI
@ -818,7 +845,9 @@ voice.start() # background thread
voice.stop()
```
Wake words are configured in `config_Voice.json::stt.wake_words_en`. The brain's `_init_voice()` wires `on_command` to `process_command(text)` + `audio_api.speak(reply)`.
Vocabulary (`wake_words`, `command_vocab`, `garbage_patterns`) is loaded from `config_Voice.json::stt.*` at `VoiceModule.__init__`. All thresholds, Whisper params, and mode selection live in the same config — no Python edits required to tune. See `Doc/controlling.md` → "Voice" for the tuning-knobs cheat sheet.
The brain's `_init_voice()` wires `on_command` to `process_command(text)``audio_api.speak(reply)`.
### AudioAPI — `API.audio_api.AudioAPI`

View File

@ -15,7 +15,7 @@
- **ZMQ bind moved to `init_zmq()`** — no longer runs at import time; multiprocessing children (LiDAR SLAM worker) can safely re-import.
- **G1 built-in microphone** via UDP multicast `239.168.123.161:5555``Voice/builtin_mic.py` replaces Hollyland/`parec` as the default mic.
- **G1 built-in TTS** via `client.TtsMaker()``Voice/builtin_tts.py`. English only. Edge-tts / Piper / XTTS paths removed.
- **Gemini voice module deleted** — Whisper wake-word + command STT path is now authoritative (`Voice/marcus_voice.py`).
- **Voice stack finalised** — custom energy wake detector (`Voice/wake_detector.py`) + faster-whisper command STT (`Voice/marcus_voice.py`). Whisper verifies each acoustic wake before acking. Gemini voice module and Vosk grammar STT both tried and removed.
- **Subsystem flags**`config_Brain.json::subsystems.{lidar, voice, imgsearch, autonomous}` let you selectively skip heavy boot stages.
- **Conditional inner-loop sleeps** — goal_nav / autonomous / imgsearch no longer pay unconditional per-step naps.
- **Core/Logger.py → Core/log_backend.py** — case-only name collision with `logger.py` resolved; repo clones cleanly on macOS/Windows.
@ -66,7 +66,7 @@ Marcus/
│ ├── config_Memory.json # session/places paths
│ ├── config_Network.json # Jetson IPs (eth0/wlan0), ports
│ ├── config_ImageSearch.json # search defaults
│ ├── config_Voice.json # mic (builtin_udp|pactl_parec), TTS backend, wake words, mic_udp group/port
│ ├── config_Voice.json # mic, TTS, wake detector thresholds, Whisper params, wake_words/command_vocab/garbage_patterns vocab lists, VAD thresholds
│ ├── config_LiDAR.json # Livox Mid-360 connection + SLAM engine params
│ └── marcus_prompts.yaml # All Qwen-VL prompts (main, goal, patrol, talk, verify, 2× imgsearch)
│ # Total: 12 JSON files + 1 YAML. (config_Memory.json removed 2026-04-21.)
@ -83,10 +83,11 @@ Marcus/
│ ├── audio_api.py # AudioAPI — speak() via G1 TtsMaker, record() via BuiltinMic
│ └── lidar_api.py # LiDAR wrapper: init_lidar(), obstacle_ahead(), get_lidar_status()
├── Voice/ # Mic + TTS + wake-word STT
├── Voice/ # Mic + TTS + wake detector + faster-whisper STT
│ ├── builtin_mic.py # G1 array mic via UDP multicast 239.168.123.161:5555
│ ├── builtin_tts.py # BuiltinTTS — client.TtsMaker(text, speaker_id)
│ └── marcus_voice.py # VoiceModule — Whisper tiny (wake) + small (command) state machine
│ ├── wake_detector.py # Pure-numpy energy wake detector (WakeDetector, WakeConfig) with adaptive baseline
│ └── marcus_voice.py # VoiceModule — orchestrates wake → verify → record → Whisper → dispatch
├── Brain/ # Decision logic — imports ONLY from API/
│ ├── marcus_brain.py # Orchestrator: init_brain(), process_command(), run_terminal()
@ -188,7 +189,8 @@ Marcus/
│ goal_nav.py │ │ builtin_mic.py │
│ patrol.py │ │ builtin_tts.py │
│ marcus_odometry.py │ │ marcus_voice.py │
│ marcus_yolo.py │ │ (Whisper + TtsMaker) │
│ marcus_yolo.py │ │ wake_detector.py │
│ │ │ (Whisper + TtsMaker) │
│ marcus_imgsearch.py │ └──────────┬──────────────┘
└──────────────┬───────────┘ │
│ │
@ -487,10 +489,10 @@ Supports text-only search (no reference image) using hint description.
### Voice/
Mic, TTS and wake-word pipeline. All three files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable).
Mic, TTS, energy-based wake detector, and faster-whisper STT pipeline. All files run only when `config_Brain.json::subsystems.voice == true`. Everything is local — no internet, no WebSocket, no cloud API. TTS is English-only by design (the G1 firmware maps non-English to Chinese, which is unusable).
#### `builtin_mic.py` (~180 lines, new 2026-04-21)
Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer.
#### `builtin_mic.py` (~180 lines)
Joins the G1's on-board audio multicast group (`239.168.123.161:5555`) and buffers incoming int16 mono 16 kHz PCM. Thread-safe ring buffer. Identical pattern to `Project/Sanad/voice/audio_io.py::BuiltinMic`.
**Exports:**
- `BuiltinMic(group, port, buf_max, read_timeout)` — init (idempotent)
@ -499,20 +501,41 @@ Ported from `Project/Sanad/voice/audio_io.py::BuiltinMic`. Joins the G1's on-boa
- `read_seconds(s)` — convenience for "record `s` seconds"
- `flush()` — drop buffered audio (called while TTS plays, to avoid echo)
#### `builtin_tts.py` (~70 lines, new 2026-04-21)
Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input (the G1 silently maps Arabic to Chinese, which confuses everyone).
#### `builtin_tts.py` (~70 lines)
Thin wrapper around `unitree_sdk2py.g1.audio.g1_audio_client.AudioClient.TtsMaker(text, speaker_id)`. Synchronous — blocks until the estimated playback duration elapses. Refuses non-ASCII input.
**Exports:**
- `BuiltinTTS(audio_client, default_speaker_id=0)` — init
- `speak(text, speaker_id=None, block=True)` — synth+play on G1 body speaker
#### `marcus_voice.py` (~340 lines, rewired 2026-04-21)
Always-listening English voice loop with a four-state machine (`IDLE → WAKE_HEARD → PROCESSING → SPEAKING`). Whisper `tiny` listens for the wake word "Sanad" on 2-second chunks; Whisper `small` transcribes the full command. Mic input comes from `BuiltinMic`; responses go through `audio_api.speak()` → `BuiltinTTS`.
#### `wake_detector.py` (~240 lines)
Pure-numpy energy-envelope state machine. Fires a wake event when it sees a short speech burst (0.2-1.5 s) sized to match a single spoken word like "Sanad", followed by a clear silence. No ML, no lexicon — just amplitude classification.
Adaptive noise-floor baseline: learns ambient RMS during idle, raises the effective threshold proportionally, so the detector works the same in a quiet room and a noisy lab. Captures the triggering burst audio (`get_last_burst()`) so callers can verify it was actually "Sanad" before acking. Exists because Vosk/Whisper both failed on the G1 far-field mic for short non-English proper nouns.
**Exports:**
- `VoiceModule(audio_api, on_command=cb)` — init
- `start()` — spawn background thread
- `stop()` — graceful teardown
- `WakeDetector(cfg)` with `WakeConfig(sample_rate, speech_threshold, min_word_duration_s, max_word_duration_s, post_silence_s, cooldown_s, chunk_ms, adaptive_window_n, adaptive_mult, diag_log_sec)`
- `process(pcm_bytes) -> bool` — feed audio, returns True once per spoken "word"
- `reset()`, `get_last_burst() -> np.ndarray | None`
#### `marcus_voice.py` (~1000 lines)
Voice orchestrator. Reads from `BuiltinMic`, runs the `WakeDetector`, verifies the wake burst with a lightweight Whisper decode, records the command with hysteretic VAD (speech_entry / silence_exit thresholds, adaptive to measured ambient), trims leading silence before Whisper, transcribes with faster-whisper, fuzzy-matches against `command_vocab` to canonicalize near-misses ("Turn right up" → "turn right"), then dispatches to the brain callback.
Three operating modes selectable via `stt.mode`:
- `wake_and_command` (default): classic acoustic wake → TTS "Yes" → record → Whisper → brain
- `always_on`: no wake, transcribe every utterance, dispatch all
- `always_on_gated`: transcribe everything, only dispatch utterances containing "Sanad"
Wake verify rule: Whisper's decode must either contain a wake-word variant (`stt.wake_words`) OR start with `s/sh/z` — Whisper's consistent signature for mishearing "Sanad" as "Stop"/"Set"/"Sand". Pure silence / non-s speech is rejected silently.
**Module-level** (populated at `VoiceModule.__init__` from config):
- `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` — loaded from `config_Voice.json::stt.*`, single source of truth
- `_has_wake_word(text)`, `_strip_wake_word(text)` — iterative until stable, handles "Sanad. Sanad." → ""
- `_closest_command(text, cutoff)` — difflib fuzzy-match against `COMMAND_VOCAB`
**Exports:**
- `VoiceModule(audio_api, on_command=cb, on_wake=None)` — init
- `start()` / `stop()` — background thread lifecycle
- `is_running` property
---

View File

@ -79,16 +79,30 @@ Or skip prompt: `python3 -m Client.marcus_cli --ip 192.168.123.164 --port 8765`
## Voice
- **Wake word:** "Sanad" (variants "sannad", "sanat", "sunnat" — see `config_Voice.json::stt.wake_words_en`)
- **Wake word:** "Sanad" (Whisper mishears it as "Stop", "Sand", "Set", "Send" — all accepted via the /s-/ phonetic rule; see `config_Voice.json::stt.wake_words` for the 33 fuzzy variants).
- **Mic:** G1 on-board array mic, captured via UDP multicast `239.168.123.161:5555` (16 kHz mono, 16-bit PCM). No USB mic needed.
- **STT:** Whisper `tiny` (wake detection) + Whisper `small` (command transcription) — both run locally.
- **Wake detection:** custom energy-envelope state machine (pure numpy, no ML) — fires on any 0.35-1.5 s speech burst followed by silence. Adaptive to room ambient.
- **Wake verify:** lightweight Whisper decode on the triggering burst. Accepts if it contains a wake-word variant OR starts with `s`/`sh`/`z` (Whisper's consistent signature for "Sanad"). Rejects pure noise / non-s speech silently.
- **STT (command):** faster-whisper `base.en` int8 on CPU — loads ~1.5 s on first wake, cached after.
- **TTS:** Unitree `client.TtsMaker()` → G1 body speaker. English only.
- **Barge-in:** say something while Marcus is speaking and the mic buffer flushes on the next command.
- **Barge-in:** the mic is muted during TTS playback, then flushed on return to listening.
Interaction flow: say "Sanad" → hear *"Listening"* → speak your command → see transcript on console → Marcus answers through the speaker.
Interaction flow: say "Sanad" → hear *"Yes"* → speak your command → see transcript on console → Marcus answers through the speaker.
Three voice modes selectable via `config_Voice.json::stt.mode`:
- `wake_and_command` (default) — wake word required before each command
- `always_on` — continuously transcribe + dispatch every utterance
- `always_on_gated` — always listen + log, dispatch only if utterance contains "Sanad"
To disable voice entirely, set `subsystems.voice: false` in `config_Brain.json` — Marcus will boot text-only ~2 s faster.
**Tuning knobs** (when false wakes or rejected real wakes) — all in `config_Voice.json::stt`:
- Too many false wakes from coughs/claps → raise `speech_threshold` or `min_word_duration`
- Real "Sanad" being rejected → check the log line `wake REJECTED — %r` to see what Whisper heard; widen `wake_words` if needed
- Commands transcribed wrong → check `whisper: lp=%.2f nsp=%.2f text=%r` log line; lower `whisper_no_speech_threshold` or tighten `whisper_log_prob_threshold`
- "I didn't catch that" on silence → raise `min_transcription_length`
- Latency too high → set `wake_ack: "none"` (skip "Yes" TTS, save ~1.7 s/cycle)
---
## Command Reference
@ -230,7 +244,7 @@ Most values configurable in `Config/config_Network.json` and `config_Voice.json:
| `llama runner process has terminated: %!w(<nil>)` | Ollama compute graph OOM | Already capped at `num_batch=128 / num_ctx=2048`. Check `free -h`; kill stale Ollama runners: `pkill -f "ollama runner"` |
| Traceback mentioning `multiprocessing/spawn.py` + ZMQ port 5556 | Old import-time ZMQ bind regressed | Pull latest `API/zmq_api.py` — must call `init_zmq()` from the parent only |
| `[Camera] No frame for 10s` during warmup | Ollama blocking the main thread, or USB bandwidth | Warmup is ~10–15 s on first Qwen load; subsequent commands are fast |
| Wake word never fires | Whisper hearing something else | Check `logs/voice.log` — if it transcribes as "sunnat"/"sannat", add your variant to `config_Voice.json::stt.wake_words_en` |
| Wake word never fires | Energy burst below floor, or Whisper verify rejecting | Check `logs/voice.log` — if you see `wake REJECTED — 'X'`, add X's root variant to `config_Voice.json::stt.wake_words`. If `baseline=0` persists, your ambient exceeds the floor — raise `speech_threshold`. |
| Mic silent | G1 audio service not publishing | Run `python3 Voice/builtin_mic.py` standalone — must print "OK — mic is capturing audio" |
| `[LiDAR] No data yet (will keep trying)` | SLAM worker still spawning (normal) or Livox network | First ~5 s normal. If persists, `ping 192.168.123.120` |
| Client can't connect | Wrong IP or server not running | Verify `ollama serve &` and `python3 -m Server.marcus_server` are both up |
@ -243,7 +257,7 @@ Most values configurable in `Config/config_Network.json` and `config_Voice.json:
|------|------|
| Brain code | `~/Marcus/Brain/` |
| Server | `~/Marcus/Server/marcus_server.py` |
| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,marcus_voice}.py` |
| Voice | `~/Marcus/Voice/{builtin_mic,builtin_tts,wake_detector,marcus_voice}.py` |
| Config | `~/Marcus/Config/` |
| Prompts | `~/Marcus/Config/marcus_prompts.yaml` |
| YOLO model | `~/Marcus/Models/yolov8m.pt` |
@ -261,7 +275,7 @@ See `Doc/functions.md` for the full function inventory (AST-generated).
## Language policy
**English only.** Arabic was removed from the codebase on 2026-04-21:
- `Config/config_Voice.json::stt.wake_words_en` — only English variants (`sanad`, `sannad`, `sanat`, `sunnat`)
- `Config/config_Voice.json::stt.wake_words` — English fuzzy variants only (33 entries), excludes common English words that would false-trigger (`said`, `sand`, `sunday`, etc.)
- `Config/marcus_prompts.yaml` — no Arabic examples left in any of the 7 prompts
- `API/audio_api.py::speak(text)` — rejects non-ASCII (the G1 TtsMaker silently maps Arabic to Chinese, which nobody wants)
- `Brain/marcus_brain.py` — greeting and talk-pattern regexes match English only

View File

@ -384,3 +384,5 @@ Config file (`Config/config_Vision.json`):
| 2026-04-21 | **Subprocess leak fix**: `AudioAPI._record_parec` now wraps `Popen` in try/finally with `terminate → wait(1.0) → kill` fallback; orphan `parec` processes can no longer survive Ctrl-C. Last-resort `proc.kill()` catches only `OSError` (not bare `except`). |
| 2026-04-21 | **Modelfile corrected**: `Models/Modelfile` now `FROM qwen2.5vl:3b` (was `:7b`) with a header explaining it's an optional build template — runtime uses `ollama pull qwen2.5vl:3b` directly. |
| 2026-04-21 | **Final verification**: 14-dimension smoke test green — no Arabic, no dead dirs, 0 orphan keys, every FileHandler rotates, no bare `except: pass`, no stale `Models_marcus` / `marcus_llava` refs, 25/25 modules import. |
| 2026-04-24 | **Voice finalised on faster-whisper + custom energy wake**. Added `Voice/wake_detector.py` (pure-numpy energy state machine, adaptive noise floor, burst-audio capture for verify). Rewrote `Voice/marcus_voice.py` around it: three operating modes (`wake_and_command` / `always_on` / `always_on_gated`), hysteretic record VAD, pre-speech silence trim (300 ms pre-roll preserved), faster-whisper `base.en` int8 CPU decode, fuzzy-match canonicalisation against `command_vocab`, `GARBAGE_PATTERNS` + length filter for noise hallucinations, `/s-/` phonetic wake verify (accepts Whisper mishearings of "Sanad" like "Stop"/"Set"/"Sand"). Tried and reverted: Gemini Live WebSocket (Python 3.8 incompatibility + latency), Vosk grammar STT (English lexicon can't decode "Sanad"; big model cold-load too slow on Jetson). All voice tunables (33 wake_words, 68 command_vocab, 17 garbage_patterns, ~25 threshold/VAD/Whisper keys) live in `config_Voice.json::stt.*` — zero hardcoded strings in Voice/. |
| 2026-04-24 | **Command parser widened**: `Brain/command_parser.py` now has `_RE_SIMPLE_DIR` (`left`, `go back`, `move forward`, `step right`, etc.) and `_RE_STOP_SIMPLE` (`stop`, `halt`, `wait`, `pause`, `freeze`) regex fast-paths — these bare-direction / bare-stop commands now skip Qwen entirely (~50 ms vs ~5 s). Motion velocities and step duration pulled from `config_Navigation.json::{move_map, step_duration_sec}` via `API/zmq_api.py`; command_parser no longer contains hardcoded `0.3` / `2.0` magic numbers. |

View File

@ -49,13 +49,14 @@ Script only. Prepends `PROJECT_ROOT` to `sys.path`, then calls `Brain.marcus_bra
---
## `Voice/` — mic + TTS + STT
## `Voice/` — mic + TTS + wake + STT
| File | Public API |
|---|---|
| `builtin_mic.py` | `_find_g1_local_ip()` + **class `BuiltinMic`** |
| `builtin_tts.py` | **class `BuiltinTTS`** |
| `marcus_voice.py` | **class `State`** (IDLE/WAKE_HEARD/PROCESSING/SPEAKING), **class `VoiceModule`** |
| `wake_detector.py` | **dataclass `WakeConfig`** + **class `WakeDetector`** |
| `marcus_voice.py` | module-level `WAKE_WORDS`, `COMMAND_VOCAB`, `GARBAGE_PATTERNS` (populated from config), helpers `_has_wake_word`, `_strip_wake_word`, `_strip_wake_word_once`, `_closest_command`, **class `VoiceModule`** |
**`Voice.builtin_mic.BuiltinMic`** — G1 UDP multicast mic:
`__init__(group, port, buf_max, read_timeout)`, `start()`, `stop()`, `read_chunk(num_bytes)`, `read_seconds(seconds)`, `flush()`; internal `_recv_loop`.
@ -63,8 +64,11 @@ Script only. Prepends `PROJECT_ROOT` to `sys.path`, then calls `Brain.marcus_bra
**`Voice.builtin_tts.BuiltinTTS`** — wraps `AudioClient.TtsMaker`:
`__init__(audio_client, default_speaker_id=0)`, `speak(text, speaker_id=None, block=True)`.
**`Voice.marcus_voice.VoiceModule`** — Whisper wake + command STT:
`__init__(audio_api, on_command)`, `start()`, `stop()`, props `state`, `is_running`. Internal state machine: `_do_idle`, `_do_wake_heard`, `_do_processing`; helpers `_load_whisper`, `_transcribe`, `_check_wake_word`, `_record_chunk`, `_record_until_silence`, `_voice_loop`.
**`Voice.wake_detector.WakeDetector`** — pure-numpy energy wake:
`__init__(cfg: WakeConfig)`, `process(pcm_bytes) -> bool`, `reset()`, `get_last_burst() -> np.ndarray | None`. Internal: `_step(window)` state-machine per 50 ms analysis window; adaptive `_baseline_buf` rolling mean of idle-silence RMS; captures triggering burst audio for post-hoc Whisper verify.
**`Voice.marcus_voice.VoiceModule`** — voice orchestrator. Drives the wake detector, verifies each fire with a lightweight Whisper decode (wake-word substring OR /s-/ phonetic match), records commands with a hysteretic VAD, trims pre-speech silence, transcribes via faster-whisper, fuzzy-normalises near-misses to canonical commands, dispatches to brain.
`__init__(audio_api, on_command=None, on_wake=None)`, `start()`, `stop()`, `is_running` property. Internal: `_get_fw()` lazy faster-whisper loader, `_read_mic_raw` / `_read_mic_gained`, `_record_command()` with adaptive VAD + pre-silence trim, `_transcribe(audio)` Whisper decode + garbage filter, `_transcribe_command(audio)` thin wrapper, `_normalize_command(text)` fuzzy-match to `COMMAND_VOCAB`, `_handle_wake()` / `_voice_loop()` / `_voice_loop_wake()` / `_voice_loop_always_on(gated)`, `_save_unk_wav(audio)` for post-mortem debugging.
---

View File

@ -51,22 +51,43 @@ G1 body mic (array)
Voice/builtin_mic.py::BuiltinMic
ring buffer (64 KB) + read_chunk(n)
Voice/marcus_voice.py::VoiceModule (IDLE → WAKE_HEARD → PROCESSING → SPEAKING)
├─ IDLE : 2-s chunks → Whisper tiny → wake-word match ("sanad"/"sannad"/…)
├─ WAKE_HEARD : audio_api.speak("Listening") → G1 body speaker
├─ PROCESSING : record-until-silence → Whisper small → transcribed text
└─ on_command(text, "en")
Voice/wake_detector.py::WakeDetector
pure-numpy energy state machine (SILENCE ⇄ SPEAKING)
adaptive noise floor: eff_threshold = max(speech_threshold, baseline × 3)
fires on 0.35-1.5 s bursts followed by 0.3 s silence → captures burst audio
Voice/marcus_voice.py::VoiceModule._handle_wake()
├─ 1. Whisper verify on the burst audio:
│ text = faster-whisper(burst)
│ accept if _has_wake_word(text) OR startswith(s/sh/z)
│ reject otherwise (cough, clap, hello, okay) → silent return
├─ 2. audio_api.speak("Yes") → G1 body speaker (~1.5 s)
├─ 3. post_tts_settle_sec wait + mic flush
├─ 4. _record_command() — hysteretic VAD
│ speech_entry_rms / silence_exit_rms (adapt from wake baseline)
│ trim leading silence (keep 300 ms pre-roll) → tight clip for Whisper
├─ 5. _transcribe(audio)
│ faster-whisper (base.en int8 CPU)
│ beam_size=8, temperature fallback [0, 0.2, 0.4], initial_prompt bias toward Sanad vocab
│ GARBAGE_PATTERNS + min_transcription_length reject noise hallucinations
├─ 6. _normalize_command(text)
│ difflib fuzzy-match vs stt.command_vocab
│ "Turn right up" → "turn right" (canonical form)
└─ 7. on_command(text, "en")
Brain/marcus_brain.py::process_command(text)
├─ regex fast-path → Brain/command_parser.py::try_local_command()
│ places · odometry walk/turn · patrol · session recall · goal_nav · auto on/off
│ places · odometry walk/turn · patrol · session recall · goal_nav
│ + SIMPLE_DIR ("go back", "right", "forward") · STOP_SIMPLE ("stop", "halt")
│ + NAT_GOAL_RE (naturalised goals like "the chair") · auto on/off
│ (~50 ms when matched — NO LLM call)
└─ else → _handle_llava(text)
├─ get_frame() (10×50 ms poll, no 1 s stall)
├─ API/llava_api.py::ask(text, img)
│ ollama.chat(qwen2.5vl:3b, num_batch=128, num_ctx=2048, num_predict=120)
│ → parse_json() → {actions, arm, speak, abort}
└─ Brain/executor.py::execute(d)
├─ actions → API/zmq_api.py::send_vel(vx, vy, vyaw) → Holosoma
├─ actions → MOVE_MAP[dir] → API/zmq_api.py::send_vel → Holosoma
├─ arm → API/arm_api.py (stub for now)
└─ abort → gradual_stop()
@ -77,9 +98,17 @@ API/audio_api.py::speak(text, lang="en")
├─ Voice/builtin_tts.py::BuiltinTTS.speak(text)
│ client.TtsMaker(text, speaker_id=0) — G1 on-board engine, English only
│ time.sleep(len(text) * 0.08)
└─ unmute mic → back to IDLE
└─ unmute mic → back to listening
```
**Config knobs** (all in `config_Voice.json::stt`):
- Wake: `speech_threshold` (floor), `min_word_duration`, `max_word_duration`, `post_silence`, `wake_cooldown`, `wake_adaptive_mult`, `wake_diag_log_sec`
- Verify: `wake_verify_enabled`
- Record: `speech_entry_rms`, `silence_exit_rms`, `silence_duration_sec`, `max_record_sec`, `min_record_sec`, `ambient_mult`, `ambient_cap_rms`
- Whisper: `whisper_model`, `whisper_compute_type`, `whisper_beam_size`, `whisper_no_speech_threshold`, `whisper_log_prob_threshold`, `whisper_initial_prompt`, `mic_gain`
- Vocab: `wake_words`, `command_vocab`, `garbage_patterns`, `command_vocab_cutoff`, `min_transcription_length`
- Mode: `mode` (`wake_and_command` | `always_on` | `always_on_gated`), `wake_ack` (`tts`|`none`)
---
## Terminal / WebSocket command pipeline (same brain, skips voice)
@ -169,7 +198,12 @@ Brain/command_parser.py — responds to "lidar status" queries
| `mic_udp.group/port` | config_Voice.json | where to join the G1 audio multicast |
| `mic_udp.read_timeout_sec` | config_Voice.json | `BuiltinMic.read_chunk` budget (default 0.04 s) |
| `tts.backend` | config_Voice.json | `builtin_ttsmaker` (only supported option) |
| `stt.wake_words_en` | config_Voice.json | Whisper matcher (`sanad` + variants) |
| `stt.wake_words` | config_Voice.json | 33 fuzzy variants of "Sanad" for the wake-verify substring match |
| `stt.command_vocab` | config_Voice.json | 68 canonical command phrases for fuzzy-normalization (`"turn right up"` → `"turn right"`) |
| `stt.garbage_patterns` | config_Voice.json | 17 Whisper noise-hallucinations to reject (`"thanks for watching"`, `"okay"`, etc.) |
| `stt.speech_threshold` etc. | config_Voice.json | energy wake detector thresholds — see `Doc/controlling.md` "Voice" for the full tuning matrix |
| `stt.whisper_*` | config_Voice.json | faster-whisper model, compute type, beam size, confidence gates, bias prompt |
| `stt.mode` | config_Voice.json | `wake_and_command` (default) / `always_on` / `always_on_gated` |
| `timeout_ms`, `stale_threshold_s`, `reconnect_delay_s` | config_Camera.json | RealSense frame timeout, reconnect trigger, initial backoff |
| `default_max_steps`, `step_delay_s`, `rotate_speed`, `min_steps_warmup` | config_ImageSearch.json | image-guided search rotation cadence (wired into `Vision/marcus_imgsearch.py`) |
| `default_walk_speed`, `dist_tolerance`, `angle_tolerance`, `safety_timeout_mult`, `dr_update_hz` | config_Odometry.json | precise motion control (wired into `Navigation/marcus_odometry.py`) |
@ -181,9 +215,14 @@ Brain/command_parser.py — responds to "lidar status" queries
| Step | Typical | Notes |
|---|---|---|
| Wake-word detect | 200–500 ms | Whisper tiny on 2 s chunk |
| Record until silence | 1–8 s | depends on user speech |
| Whisper small STT | 500–1500 ms | once per command |
| Wake-word detect | <100 ms | pure-numpy energy detector, 50 ms analysis windows |
| Wake verify (first wake) | ~2000 ms | includes faster-whisper `base.en` cold load |
| Wake verify (subsequent) | 300–700 ms | Whisper cached, decodes ~0.5-1.5 s burst |
| "Yes" TTS ack | ~1500 ms | G1 firmware `TtsMaker` minimum |
| Record until silence | 1–5 s | depends on user speech; `max_record_sec=5` cap |
| Pre-silence trim | <1 ms | numpy slice |
| faster-whisper STT | 500–1500 ms | `base.en` int8 on CPU, beam_size=8 |
| Fuzzy-match canonicalisation | <1 ms | difflib against 68 phrases |
| Camera frame fetch | <50 ms | poll loop, no 1 s blocking stall |
| Ollama Qwen2.5-VL | 800–1500 ms | `num_batch=128 / num_ctx=2048 / num_predict=120` |
| Executor + ZMQ send | <10 ms | fire-and-forget PUB |

View File

@ -1,9 +0,0 @@
US English model for mobile Vosk applications
Copyright 2020 Alpha Cephei Inc
Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean)
Speed: 0.11xRT (desktop)
Latency: 0.15s (right context)

View File

@ -1,7 +0,0 @@
--sample-frequency=16000
--use-energy=false
--num-mel-bins=40
--num-ceps=40
--low-freq=20
--high-freq=7600
--allow-downsample=true

View File

@ -1,10 +0,0 @@
--min-active=200
--max-active=3000
--beam=10.0
--lattice-beam=2.0
--acoustic-scale=1.0
--frame-subsampling-factor=3
--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
--endpoint.rule2.min-trailing-silence=0.5
--endpoint.rule3.min-trailing-silence=0.75
--endpoint.rule4.min-trailing-silence=1.0

View File

@ -1,17 +0,0 @@
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031

View File

@ -1,166 +0,0 @@
1 nonword
2 begin
3 end
4 internal
5 singleton
6 nonword
7 begin
8 end
9 internal
10 singleton
11 begin
12 end
13 internal
14 singleton
15 begin
16 end
17 internal
18 singleton
19 begin
20 end
21 internal
22 singleton
23 begin
24 end
25 internal
26 singleton
27 begin
28 end
29 internal
30 singleton
31 begin
32 end
33 internal
34 singleton
35 begin
36 end
37 internal
38 singleton
39 begin
40 end
41 internal
42 singleton
43 begin
44 end
45 internal
46 singleton
47 begin
48 end
49 internal
50 singleton
51 begin
52 end
53 internal
54 singleton
55 begin
56 end
57 internal
58 singleton
59 begin
60 end
61 internal
62 singleton
63 begin
64 end
65 internal
66 singleton
67 begin
68 end
69 internal
70 singleton
71 begin
72 end
73 internal
74 singleton
75 begin
76 end
77 internal
78 singleton
79 begin
80 end
81 internal
82 singleton
83 begin
84 end
85 internal
86 singleton
87 begin
88 end
89 internal
90 singleton
91 begin
92 end
93 internal
94 singleton
95 begin
96 end
97 internal
98 singleton
99 begin
100 end
101 internal
102 singleton
103 begin
104 end
105 internal
106 singleton
107 begin
108 end
109 internal
110 singleton
111 begin
112 end
113 internal
114 singleton
115 begin
116 end
117 internal
118 singleton
119 begin
120 end
121 internal
122 singleton
123 begin
124 end
125 internal
126 singleton
127 begin
128 end
129 internal
130 singleton
131 begin
132 end
133 internal
134 singleton
135 begin
136 end
137 internal
138 singleton
139 begin
140 end
141 internal
142 singleton
143 begin
144 end
145 internal
146 singleton
147 begin
148 end
149 internal
150 singleton
151 begin
152 end
153 internal
154 singleton
155 begin
156 end
157 internal
158 singleton
159 begin
160 end
161 internal
162 singleton
163 begin
164 end
165 internal
166 singleton

View File

@ -1,3 +0,0 @@
[
1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09
1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ]

View File

@ -1 +0,0 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh

View File

@ -1,2 +0,0 @@
--left-context=3
--right-context=3

View File

@ -25,7 +25,7 @@ a Python brain.
| **Brain** (reason, speak, decide) | Parse commands, reason about vision, pick actions | **Qwen2.5-VL 3B** via Ollama | Jetson GPU |
| **Eyes** (see) | Real-time object/person detection | **YOLOv8m** (CUDA, FP16, 320 px, ~22 FPS) | Jetson GPU |
| **Eyes** (understand) | Open-ended scene understanding, reading, goal-verify | **Qwen2.5-VL** (same brain model) | Jetson GPU |
| **Ears** (hear) | Always-on wake-word + command transcription | **Whisper tiny** (wake) + **Whisper small** (STT) | Jetson CPU/GPU |
| **Ears** (hear) | Energy-based wake detector + command transcription | **Custom DSP wake** (numpy, no ML) + **faster-whisper base.en int8** (STT) | Jetson CPU |
| **Mouth** (speak) | On-robot TTS, no internet needed | **Unitree `TtsMaker`** (G1 firmware) | G1 body speaker |
| **Legs** (walk) | 29-DoF locomotion + balance | **Holosoma** RL policy (separate process, ONNX) | Jetson CPU |
| **Hands** (gesture) | Arm & hand actions | **GR00T N1.5** — pending; `API/arm_api.py` is a stub today | Jetson GPU (future) |
@ -54,7 +54,7 @@ Camera ─┘ ▼ ├─► Legs (Holosoma
Three input modalities, same command loop:
- **Voice** — say "**Sanad, what do you see?**" → wake word fires, Whisper transcribes, brain answers through the G1 speaker.
- **Voice** — say "**Sanad**" → energy detector fires, Whisper verifies the /s-/ phoneme signature, robot replies "Yes" → speak your command → faster-whisper transcribes → brain answers through the G1 speaker.
- **Text** — type the same command into `run_marcus.py`'s terminal.
- **WebSocket (remote)** — `Client/marcus_cli.py` or `Client/marcus_client.py` (Tkinter GUI) send commands from a workstation.
@ -84,7 +84,8 @@ There are two schools for combining them:
| Vision — open-ended scene understanding | same VLM | learned |
| Legs / locomotion | **RL policy** (Holosoma, ONNX) | learned |
| Arms / gestures | SDK action-ID lookup | **hand-coded** |
| Wake-word + STT | Whisper | learned |
| Wake word | Custom energy-envelope DSP (numpy) | hand-coded |
| STT (command) | faster-whisper base.en | learned |
| TTS | Unitree `TtsMaker` (on-robot DSP) | firmware |
| Glue between layers | Python + ZMQ + JSON | hand-coded |
@ -143,13 +144,13 @@ Same hardware, different prompts + wake word.
- **Prompts** rewrite: *"You are a museum guide. When a visitor asks about an exhibit, describe it in two sentences and invite them to ask follow-ups."*
- **Places** memory pre-loaded with exhibit waypoints; `patrol: exhibit_A → exhibit_B → exit` follows a tour.
- Wake word changed in `config_Voice.json::stt.wake_words_en`.
- Wake word variants in `config_Voice.json::stt.wake_words` (fuzzy list, handles Whisper mishearings of "Sanad").
- Image search (`search/ photo_of_exhibit.jpg`) lets visitors hold up a printed map; the robot navigates to the matching location.
- YOLO classes trimmed to people-only if the venue doesn't need object safety.
**What you change to switch use cases:**
1. `Config/marcus_prompts.yaml` — persona + task descriptions
2. `Config/config_Voice.json::stt.wake_words_en` — the name people call the robot
2. `Config/config_Voice.json::stt.wake_words` — the name (+ fuzzy variants) people call the robot
3. `Config/config_Vision.json::tracked_classes` — relevant object set
4. `Config/config_Brain.json::subsystems.{lidar,voice,imgsearch,autonomous}` — enable what you need
5. Data under `Data/History/Places/places.json` — learned locations
@ -174,6 +175,7 @@ No code changes required for either deployment.
Vision/ Navigation/ Voice/ Lidar/
YOLO, imgsearch goal_nav, builtin_mic, SLAM engine
patrol, odom builtin_tts, (subprocess)
wake_detector,
marcus_voice
@ -253,7 +255,7 @@ Marcus/
├── Brain/ orchestrator, parser, executor, memory
├── Vision/ YOLO + image-guided search
├── Navigation/ goal nav, patrol, odometry
├── Voice/ built-in mic, built-in TTS, Whisper loop
├── Voice/ built-in mic, TTS, energy wake detector, faster-whisper STT
├── Autonomous/ exploration state machine
├── Lidar/ SLAM engine (subprocess)
├── Server/ WebSocket interface

File diff suppressed because it is too large Load Diff

View File

@ -49,10 +49,14 @@ import numpy as np
@dataclass
class WakeConfig:
sample_rate: int = 16_000
# RMS (int16 units) above which we consider a chunk to be speech.
# G1 on-board mic at normal speaking distance has rms ≈ 500-1500
# during speech and ≈ 40-100 in silence. 150 is a safe middle ground.
speech_threshold: float = 150.0
# RMS (int16 units) FLOOR for "this chunk is speech". The effective
# threshold is max(speech_threshold, ambient_baseline * adaptive_mult)
# so this is only a minimum guarantee — the detector adapts upward
# in noisy rooms but never below this floor.
# G1 far-field mic at normal speaking distance has rms ~ 80-400 for
# quiet speech, 400-1500 for clear speech. 80 catches quiet speech;
# raise to 120-150 if fan/typing noise triggers false wakes.
speech_threshold: float = 80.0
# How long a burst of speech must last to count as a "word".
min_word_duration_s: float = 0.20
max_word_duration_s: float = 1.50
@ -63,6 +67,13 @@ class WakeConfig:
cooldown_s: float = 1.50
# RMS window size — we analyze this many ms of audio per step.
chunk_ms: int = 50
# Adaptive: how many *recent silent* chunks to average for the noise
# floor, and the multiplier applied on top. effective_threshold =
# max(speech_threshold, baseline * adaptive_mult).
adaptive_window_n: int = 50 # ~2.5 s at 50 ms chunks
adaptive_mult: float = 3.0
# Periodic diagnostic log cadence (seconds). 0 disables.
diag_log_sec: float = 3.0
class WakeDetector:
@ -88,6 +99,24 @@ class WakeDetector:
# chunks don't align with our internal analysis window).
self._carry = np.zeros(0, dtype=np.int16)
# Audio of the most-recent wake-triggering burst. Saved when the
# detector fires so callers (marcus_voice) can run Whisper on it
# and verify the word was actually "Sanad" rather than a cough.
self._burst_samples: list = [] # accumulated during SPEAKING
self._last_burst_audio: Optional[np.ndarray] = None
# Adaptive noise floor (rolling mean of RMS during SILENCE chunks).
self._baseline_buf = [] # last N silent-window RMS values
self._baseline = 0.0 # current estimate
self._peak_since_diag = 0.0 # max rms since last diag log
self._last_diag = time.time()
# Logger is optional — if the host app set up logging, use it.
try:
import logging
self._log = logging.getLogger("wake_detector")
except Exception:
self._log = None
# ── public API ────────────────────────────────────────────────
def process(self, pcm_bytes: bytes) -> bool:
@ -122,31 +151,73 @@ class WakeDetector:
self._state = self.STATE_SILENCE
self._silence_run = 0
self._carry = np.zeros(0, dtype=np.int16)
self._burst_samples = []
def get_last_burst(self) -> Optional[np.ndarray]:
"""
Return the int16 PCM samples of the most-recent wake-triggering
burst, or None if no wake has fired yet. Used by marcus_voice to
verify the triggering word was actually 'Sanad' before proceeding.
"""
return self._last_burst_audio
# ── internal ──────────────────────────────────────────────────
# _step: advance the wake state machine by one analysis window of int16
# PCM samples. Returns True only when a word-length burst has just
# completed; otherwise False. Also maintains the adaptive noise floor
# and periodic diagnostics.
def _step(self, window: np.ndarray) -> bool:
rms = float(np.sqrt(np.mean(window.astype(np.float64) ** 2)))
# NOTE(review): the next line appears to be stale pre-diff residue (a
# stripped removed line) — is_speech is unconditionally recomputed
# against the adaptive threshold just below, so this value is unused.
is_speech = rms > self.cfg.speech_threshold
# Effective threshold = max(config floor, adaptive baseline * mult)
eff = self.cfg.speech_threshold
if self._baseline > 0:
eff = max(eff, self._baseline * self.cfg.adaptive_mult)
is_speech = rms > eff
# Track peak for diag. Log periodically so you can *see* what the
# detector is hearing — invaluable when "not hearing me" happens.
if rms > self._peak_since_diag:
self._peak_since_diag = rms
now = time.time()
if self.cfg.diag_log_sec > 0 and (now - self._last_diag) >= self.cfg.diag_log_sec:
if self._log is not None:
self._log.info(
"wake: peak=%.0f baseline=%.0f eff_threshold=%.0f state=%s",
self._peak_since_diag, self._baseline, eff, self._state,
)
self._peak_since_diag = 0.0
self._last_diag = now
# Cooldown after a recent wake: ignore audio, make no state changes.
if now < self._cooldown_until:
return False # silent during cooldown
if self._state == self.STATE_SILENCE:
# Learn the noise floor ONLY in silence — so speech bursts
# don't pull the baseline up and lock us out of wake.
if not is_speech:
self._baseline_buf.append(rms)
# Rolling window: drop the oldest sample beyond adaptive_window_n.
if len(self._baseline_buf) > self.cfg.adaptive_window_n:
self._baseline_buf.pop(0)
if self._baseline_buf:
self._baseline = sum(self._baseline_buf) / len(self._baseline_buf)
if is_speech:
self._state = self.STATE_SPEAKING
self._speech_start = self._sample_cursor
self._silence_run = 0
# Begin capturing the burst audio for later Whisper verify.
self._burst_samples = [window.copy()]
return False
# STATE_SPEAKING
# Accumulate every window (speech OR silence inside the burst)
# so we capture the full word + trailing quiet.
self._burst_samples.append(window.copy())
if is_speech:
self._silence_run = 0
# Abort if the burst is longer than a single word — user is
# just talking, not addressing the robot.
if self._sample_cursor - self._speech_start > self._max_speech:
self._state = self.STATE_SILENCE
self._burst_samples = []
return False
# Silent window inside SPEAKING — accumulate.
@@ -156,6 +227,12 @@ class WakeDetector:
# (Continuation of _step after a stripped hunk header: reached when the
# trailing silence run has just ended a burst. speech_len is computed in
# lines not visible here — presumably the burst's speech duration in
# samples; TODO confirm against the full file.)
self._state = self.STATE_SILENCE
self._silence_run = 0
# Fire only when the burst length is word-like (within min/max bounds).
if self._min_speech <= speech_len <= self._max_speech:
# Snapshot burst audio for the caller's Whisper verify.
self._last_burst_audio = (
np.concatenate(self._burst_samples)
if self._burst_samples else None
)
self._burst_samples = []
# Suppress re-triggering for cooldown_s after a successful wake.
self._cooldown_until = now + self.cfg.cooldown_s
return True
return False