Update 2026-04-24 15:23:19

This commit is contained in:
kassam 2026-04-24 15:23:19 +04:00
parent 5d839d4f4e
commit 9485601e18
20 changed files with 214 additions and 85 deletions

View File

@ -5,11 +5,14 @@
"target_sample_rate": 16000 "target_sample_rate": 16000
}, },
"stt": { "stt": {
"backend": "faster_whisper", "_backend_comment": "'faster_whisper' (Whisper base.en int8 on CPU) or 'moonshine' (useful-sensors Moonshine via onnxruntime). Moonshine has a different error profile — worth trying when Whisper consistently mishears commands as short hallucinations like 'Yes.', 'Bye.', 'It.'. Moonshine requires `pip install moonshine-voice` on the Jetson.",
"backend": "moonshine",
"moonshine_language": "en",
"_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.", "_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.",
"_mode_comment": "Three modes: 'wake_and_command' = instant acoustic wake detector (no ML) hears 'Sanad', THEN records a ~2s command, transcribes once — fastest, most reliable on G1 mic. 'always_on' = continuous VAD → Whisper every utterance, dispatch all (chatty, LLM gets every noise). 'always_on_gated' = continuous transcribe, dispatch only utterances containing 'Sanad' (Sanad-style but Whisper hallucinates commands from TTS echo on G1 mic, creating feedback loops — keep as opt-in, not default).", "_mode_comment": "Three modes. 'always_on_gated' (default, Sanad-style) = continuously transcribe+log every utterance to logs/transcript.log. Dispatch only when wake word is in the utterance. 'Sanad <cmd>' dispatches the cmd. Bare 'Sanad' → speak 'Yes' and treat the NEXT utterance as the command (within await_command_timeout_sec). 'wake_and_command' = classic acoustic wake first, then record a separate command (no always-listen overhead). 'always_on' = transcribe + dispatch everything, no gate (chatty).",
"mode": "wake_and_command", "mode": "wake_and_command",
"await_command_timeout_sec": 10.0,
"_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.", "_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.",
"always_on_speech_entry_rms": 150.0, "always_on_speech_entry_rms": 150.0,
@ -36,7 +39,7 @@
"_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.", "_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.",
"_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.", "_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.",
"_initial_prompt_comment": "EMPTY BY DEFAULT. Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). Clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to a single short cue if you want nudging and can tolerate occasional echoes.", "_initial_prompt_comment": "EMPTY BY DEFAULT. Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). Clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to a single short cue if you want nudging and can tolerate occasional echoes.",
"whisper_initial_prompt": "Robot voice command.", "whisper_initial_prompt": "Sanad. Turn left. Turn right. Move forward. Move back. Stop. Sit down. Stand up. Wave hello. Follow me. Come here. Go home. Look around. What do you see.",
"_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.", "_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.",
"wake_words": [ "wake_words": [
@ -85,21 +88,21 @@
"_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin.", "_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin.",
"speech_threshold": 400.0, "speech_threshold": 200.0,
"min_word_duration": 0.25, "min_word_duration": 0.25,
"max_word_duration": 2.50, "max_word_duration": 2.50,
"post_silence": 0.20, "post_silence": 0.20,
"wake_cooldown": 1.00, "wake_cooldown": 1.00,
"wake_chunk_ms": 50, "wake_chunk_ms": 50,
"wake_adaptive_window_n": 50, "wake_adaptive_window_n": 50,
"wake_adaptive_mult": 3.0, "wake_adaptive_mult": 2.0,
"wake_diag_log_sec": 3.0, "wake_diag_log_sec": 3.0,
"wake_ack": "tts", "wake_ack": "tts",
"_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).", "_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).",
"_wake_verify_comment": "DISABLED for speed. When enabled, runs a ~3s Whisper decode on each wake burst and rejects non-/sa-/ speech — good for filtering coughs/claps, but adds 3s latency per wake. With it off, every acoustic wake fires — faster response, more false wakes from loud noises (but those drop silently at the 'no speech' recording stage, so user impact is near-zero). Set true again if background noise is triggering too many false wakes.", "_wake_verify_comment": "ENABLED — confirm each acoustic wake with a lightweight Whisper decode on the triggering burst BEFORE speaking 'Yes' / opening the record window. Accept if the transcription contains any wake-word variant OR starts with s/sh/z (Whisper's consistent signature for mishearing 'Sanad' as 'Stop', 'Sand', 'Set', etc.). Reject pure noise (empty whisper) and clearly non-/sa-/ speech silently. Cost: ~1-2s extra per wake for the Whisper decode. Trade-off: slower wake response, but no 'Yes' played on stray loud noises. Set false if you prefer speed over false-wake filtering.",
"wake_verify_enabled": false, "wake_verify_enabled": true,
"_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.", "_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.",

View File

@ -1,8 +0,0 @@
[
{
"time": "15:29:59",
"cmd": "hello, can you hear me",
"response": "Hello, can you hear me?",
"duration_s": 4.69
}
]

View File

@ -1,8 +0,0 @@
[
{
"time": "15:37:37",
"cmd": "turn left",
"response": "local command",
"duration_s": 0.0
}
]

View File

@ -1,8 +0,0 @@
[
{
"time": "16:22:54",
"cmd": "hi",
"response": "Hello! I am Sanad. How can I help you?",
"duration_s": 0.0
}
]

View File

@ -1,8 +0,0 @@
[
{
"time": "16:24:12",
"cmd": "what do you see",
"response": "I see a man sitting in front of a desk with a backpack and a helmet on it. There is also a red and white object hanging on the wall behind him.",
"duration_s": 0.0
}
]

Binary file not shown.

Binary file not shown.

View File

@ -60,6 +60,30 @@ logging.basicConfig(
log = logging.getLogger("marcus_voice") log = logging.getLogger("marcus_voice")
# ── Transcript log ─────────────────────────────────────────────
# One line per transcribed utterance (wake word or not, command or
# not) in a compact format, so the operator can skim everything the
# mic picked up without digging through the verbose voice.log.
# Rotating file: 5 MB per file, 3 backups kept.
_TRANSCRIPT_PATH = os.path.join(LOG_DIR, "transcript.log")
_transcript_log = logging.getLogger("transcript")
_transcript_log.setLevel(logging.INFO)
_transcript_log.propagate = False  # keep entries out of the root logger
if not _transcript_log.handlers:
    _handler = RotatingFileHandler(
        _TRANSCRIPT_PATH,
        maxBytes=5_000_000,
        backupCount=3,
        encoding="utf-8",
    )
    _handler.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
    _transcript_log.addHandler(_handler)


def _log_transcript(action: str, text: str) -> None:
    """Append one '<ACTION> <text>' line to logs/transcript.log.

    action: short tag — 'HEARD' / 'WAKE' / 'CMD' / 'UNK' / ...
    """
    _transcript_log.info("%-5s %s", action, (text or "").strip())
# Module-level vocabulary containers. EMPTY on import — populated by # Module-level vocabulary containers. EMPTY on import — populated by
# VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words, # VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words,
# command_vocab, garbage_patterns}. Config is the single source of truth; # command_vocab, garbage_patterns}. Config is the single source of truth;
@ -248,13 +272,35 @@ class VoiceModule:
if self._mic_gain != 1.0: if self._mic_gain != 1.0:
log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain) log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain)
# ── faster-whisper (lazy-init on first wake) ── # ── STT backend selection ──
# "faster_whisper" (default): Whisper base.en int8 on CPU via CTranslate2.
# "moonshine": useful-sensors Moonshine via moonshine-voice
# + onnxruntime. Different training, different
# error profile from Whisper — useful when
# Whisper's hallucinations (short "Yes.",
# "Bye.", "It.") are the failure mode.
self._backend_name = str(self._stt.get("backend", "faster_whisper")).lower()
# Lazy-init handles — concrete model loads on first wake so startup
# stays light. `False` marks a failed init so we don't keep retrying.
self._fw = None self._fw = None
self._moonshine = None
# ── Two-turn wake state (always_on_gated mode) ──
# self._awaiting_command: False = listening for wake.
# True = wake heard, next utterance is
# the command. Cleared after the
# command dispatches or after
# await_command_timeout_sec seconds
# so a stray "Sanad" doesn't arm
# forever.
self._awaiting_command = False
self._await_deadline = 0.0
self._running = False self._running = False
self._thread = None self._thread = None
self._cooldown_until = 0.0 self._cooldown_until = 0.0
log.info("VoiceModule initialized (wake=custom, stt=faster-whisper)") log.info("VoiceModule initialized (wake=custom, stt=%s)", self._backend_name)
# ─── gain-applied mic read ──────────────────────────── # ─── gain-applied mic read ────────────────────────────
@ -299,6 +345,49 @@ class VoiceModule:
self._fw = None self._fw = None
return self._fw return self._fw
# ─── lazy moonshine init ──────────────────────────────
def _get_moonshine(self):
"""
Load Moonshine (useful-sensors) on first use. Requires
`pip install moonshine-voice` on the target. Returns None if
the package isn't available — caller should fall back.
"""
if self._moonshine is not None:
return self._moonshine if self._moonshine is not False else None
lang = self._stt.get("moonshine_language", "en")
log.info("Loading Moonshine: language=%s", lang)
try:
from moonshine_voice import Transcriber
from moonshine_voice.download import download_model
model_path, model_arch = download_model(language=lang)
self._moonshine = Transcriber(
model_path=model_path, model_arch=model_arch,
)
log.info("Moonshine ready: arch=%s", model_arch)
except Exception as e:
log.error("Moonshine init failed: %s — voice will be wake-only", e)
self._moonshine = False
return None
return self._moonshine
def _moonshine_decode(self, audio_f32: np.ndarray) -> str:
"""Run Moonshine one-shot on a float32 16kHz mono array. Returns ''."""
m = self._get_moonshine()
if m is None:
return ""
try:
result = m.transcribe_without_streaming(
audio_data=audio_f32, sample_rate=self._sample_rate,
)
lines = getattr(result, "lines", None) or []
text = " ".join(getattr(ln, "text", "") for ln in lines).strip()
log.info("moonshine: text=%r", text[:80])
return text
except Exception as e:
log.error("moonshine transcribe failed: %s", e)
return ""
# ─── command recording ──────────────────────────────── # ─── command recording ────────────────────────────────
def _record_command(self) -> np.ndarray: def _record_command(self) -> np.ndarray:
@ -445,7 +534,10 @@ class VoiceModule:
# ─── transcription ──────────────────────────────────── # ─── transcription ────────────────────────────────────
def _transcribe(self, audio_i16: np.ndarray) -> str: def _transcribe(self, audio_i16: np.ndarray) -> str:
"""int16 PCM → Whisper transcription. Returns '' on no-speech/noise.""" """int16 PCM → STT transcription. Returns '' on no-speech/noise."""
if self._backend_name == "moonshine":
return self._transcribe_moonshine(audio_i16, lenient=False)
fw = self._get_fw() fw = self._get_fw()
if fw is None: if fw is None:
return "" return ""
@ -596,6 +688,9 @@ class VoiceModule:
The downside (no Sanad nudge) is fine here because the acoustic The downside (no Sanad nudge) is fine here because the acoustic
detector has already gated out non-speech. detector has already gated out non-speech.
""" """
if self._backend_name == "moonshine":
return self._transcribe_moonshine(audio_i16, lenient=True)
fw = self._get_fw() fw = self._get_fw()
if fw is None: if fw is None:
return "" return ""
@ -634,6 +729,39 @@ class VoiceModule:
log.error("whisper-raw transcribe failed: %s", e) log.error("whisper-raw transcribe failed: %s", e)
return "" return ""
def _transcribe_moonshine(self, audio_i16: np.ndarray, lenient: bool) -> str:
"""
Moonshine decode path. Light DSP only (DC-removal + peak-normalize);
Moonshine has its own internal feature extraction, and the Whisper-
oriented pre-emphasis / 80 Hz HPF are not helpful here.
lenient=True mirrors _transcribe_raw: skip the garbage-pattern and
min-length filters so wake verify can see short /s-/ phonetic signals.
lenient=False applies the same rejection pipeline as _transcribe.
"""
if audio_i16.size == 0:
return ""
audio_f32 = audio_i16.astype(np.float32) / 32768.0
audio_f32 = audio_f32 - np.mean(audio_f32)
peak = float(np.abs(audio_f32).max())
if peak > 1e-4 and peak < 0.7:
audio_f32 = audio_f32 * (0.7 / peak)
text = self._moonshine_decode(audio_f32)
if not text:
return ""
if lenient:
return text
low = text.lower().strip().rstrip(".!?,")
vocab_exact = {c.lower() for c in COMMAND_VOCAB}
if low in GARBAGE_PATTERNS or len(low) < _MIN_TRANSCRIPTION_LENGTH:
if low not in WAKE_WORDS and low not in vocab_exact:
log.info("Rejecting likely noise transcription: %r", text)
return ""
return text
# ─── command transcription ──────────────────────────── # ─── command transcription ────────────────────────────
def _transcribe_command(self, audio_i16: np.ndarray) -> str: def _transcribe_command(self, audio_i16: np.ndarray) -> str:
@ -925,47 +1053,89 @@ class VoiceModule:
text = self._transcribe_command(audio) if audio.size else "" text = self._transcribe_command(audio) if audio.size else ""
if text: if text:
log.info("HEARD: %r", text) log.info("HEARD: %r", text)
# Gated mode: only dispatch if the wake word was _log_transcript("HEARD", text)
# spoken. Everything is still logged above so the
# operator has full visibility into what the mic
# is picking up.
if gated and not _has_wake_word(text):
log.info(" (no wake word — not dispatched)")
else:
if gated:
command = _strip_wake_word(text)
if command != text:
log.info(" wake-stripped: %r%r",
text, command)
# Bare wake word ("Sanad.", "Sanad") →
# speak a "Yes" ack, do NOT call the
# brain (it would hallucinate a random
# response from a 1-word prompt).
if not command:
log.info(" wake-only utterance — speaking ack")
try:
self._audio.speak(
self._messages.get("wake_heard", "Yes")
)
except Exception as e:
log.warning("wake-ack TTS failed: %s", e)
continue
else:
command = text
# Normalize near-misses ("Turn right up" → # ── Two-turn gated flow ────────────────────
# "turn right") so command_parser's regex # State A — listening for wake:
# fast-path can hit without an LLM round-trip. # non-wake utterance → log only, do not dispatch
# "Sanad <cmd>" → strip + dispatch now
# "Sanad" alone → speak "Yes", switch to
# state B ("awaiting command")
# State B — awaiting command (after Yes):
# any utterance → dispatch as the command,
# regardless of wake word.
# Then back to state A.
#
# This matches the SanadVoice/gemini_interact
# pattern: always transcribe + log every word,
# say "yes" on wake, treat the next utterance
# as the command.
# Timeout stale await-command state
if self._awaiting_command and time.time() > self._await_deadline:
log.info(" [awaiting-cmd] timed out — back to wake-listen")
self._awaiting_command = False
if self._awaiting_command:
# State B — next utterance is the command.
command = _strip_wake_word(text) # drop accidental "Sanad,"
if not command:
command = text # safety: never drop to empty
command = self._normalize_command(command) command = self._normalize_command(command)
log.info(" [awaiting-cmd] dispatching: %r", command)
_log_transcript("CMD", command)
self._awaiting_command = False
print(f' [Sanad] heard: "{command}"') print(f' [Sanad] heard: "{command}"')
if self._on_command: if self._on_command:
try: try:
self._on_command(command, "en") self._on_command(command, "en")
except Exception as e: except Exception as e:
log.error("on_command: %s", e, exc_info=True) log.error("on_command: %s", e, exc_info=True)
continue
# State A — listening for wake.
if gated and not _has_wake_word(text):
log.info(" (no wake word — logged only)")
_log_transcript("IGN", text)
continue
if gated:
command = _strip_wake_word(text)
if command != text:
log.info(" wake-stripped: %r%r",
text, command)
if not command:
# Bare "Sanad" — speak "Yes" and arm
# the next-utterance-as-command trigger.
log.info(" wake heard alone — speaking 'Yes', "
"next utterance will be treated as command")
_log_transcript("WAKE", text)
try:
self._audio.speak(
self._messages.get("wake_heard", "Yes")
)
except Exception as e:
log.warning("wake-ack TTS failed: %s", e)
self._awaiting_command = True
self._await_deadline = time.time() + float(
self._stt.get("await_command_timeout_sec", 10.0)
)
continue
else:
command = text
# Sanad + command in one utterance (e.g.
# "Sanad, turn left") → fuzzy-normalize + dispatch.
command = self._normalize_command(command)
_log_transcript("CMD", command)
print(f' [Sanad] heard: "{command}"')
if self._on_command:
try:
self._on_command(command, "en")
except Exception as e:
log.error("on_command: %s", e, exc_info=True)
else: else:
log.info("utterance rejected (empty/garbage after Whisper)") log.info("utterance rejected (empty/garbage after Whisper)")
_log_transcript("UNK", "(empty)")
else: else:
idle_peak_rms = max(idle_peak_rms, rms) idle_peak_rms = max(idle_peak_rms, rms)
idle_sum_rms += rms idle_sum_rms += rms