Update 2026-04-24 15:23:19
This commit is contained in:
parent
5d839d4f4e
commit
9485601e18
@ -5,11 +5,14 @@
|
||||
"target_sample_rate": 16000
|
||||
},
|
||||
"stt": {
|
||||
"backend": "faster_whisper",
|
||||
"_backend_comment": "'faster_whisper' (Whisper base.en int8 on CPU) or 'moonshine' (useful-sensors Moonshine via onnxruntime). Moonshine has a different error profile — worth trying when Whisper consistently mishears commands as short hallucinations like 'Yes.', 'Bye.', 'It.'. Moonshine requires `pip install moonshine-voice` on the Jetson.",
|
||||
"backend": "moonshine",
|
||||
"moonshine_language": "en",
|
||||
"_comment": "Custom energy wake detector (instant, no ML) + faster-whisper base.en int8 on CPU for command transcription. Wake fires on any 0.2-1.5s speech burst; Whisper only runs on the recorded command, so it's ~10x less busy than a Whisper-polling setup.",
|
||||
|
||||
"_mode_comment": "Three modes: 'wake_and_command' = instant acoustic wake detector (no ML) hears 'Sanad', THEN records a ~2s command, transcribes once — fastest, most reliable on G1 mic. 'always_on' = continuous VAD → Whisper every utterance, dispatch all (chatty, LLM gets every noise). 'always_on_gated' = continuous transcribe, dispatch only utterances containing 'Sanad' (Sanad-style but Whisper hallucinates commands from TTS echo on G1 mic, creating feedback loops — keep as opt-in, not default).",
|
||||
"_mode_comment": "Three modes. 'always_on_gated' (default, Sanad-style) = continuously transcribe+log every utterance to logs/transcript.log. Dispatch only when wake word is in the utterance. 'Sanad <cmd>' dispatches the cmd. Bare 'Sanad' → speak 'Yes' and treat the NEXT utterance as the command (within await_command_timeout_sec). 'wake_and_command' = classic acoustic wake first, then record a separate command (no always-listen overhead). 'always_on' = transcribe + dispatch everything, no gate (chatty).",
|
||||
"mode": "wake_and_command",
|
||||
"await_command_timeout_sec": 10.0,
|
||||
|
||||
"_always_on_comment": "Tunables for always_on mode only. Lower entry threshold catches quieter speech (since there's no wake gate). silence_duration is how long of quiet closes an utterance. idle_log_sec is how often to print an ambient-level summary so you can see what the mic is hearing even when nobody is talking.",
|
||||
"always_on_speech_entry_rms": 150.0,
|
||||
@ -36,7 +39,7 @@
|
||||
"_whisper_temp_comment": "Temperature fallback: Whisper first tries greedy (T=0). If the output fails its own confidence gates, it retries at 0.2, then 0.4. On noisy audio this often rescues a bad greedy decode — the small random noise in softmax helps unstick the decoder from a local minimum.",
|
||||
"_whisper_gates_comment": "Looser than faster-whisper defaults (0.6 / -1.0 / 2.4) because G1 far-field mic audio has poor SNR and frequently falls below the default log-prob. A segment is dropped only if (no_speech_prob > 0.85 AND log_prob < -1.8) — lets more shaky-but-real speech through. Hallucination risk is cushioned by the GARBAGE_PATTERNS filter downstream and the fuzzy-match to command_vocab.",
|
||||
"_initial_prompt_comment": "EMPTY BY DEFAULT. Any bias prompt leaks — on unclear/short audio Whisper echoes the prompt verbatim as the transcription (seen repeatedly: 'This is a robot assistant' hallucinated from <1s of ambient). Clean, unbiased decode is worse at recognising 'Sanad' specifically but doesn't produce phantom commands. Set to a single short cue if you want nudging and can tolerate occasional echoes.",
|
||||
"whisper_initial_prompt": "Robot voice command.",
|
||||
"whisper_initial_prompt": "Sanad. Turn left. Turn right. Move forward. Move back. Stop. Sit down. Stand up. Wave hello. Follow me. Come here. Go home. Look around. What do you see.",
|
||||
|
||||
"_vocab_comment": "Tunable vocab lists for voice post-processing. wake_words = fuzzy variants of the wake word 'Sanad' Whisper might produce (used by always_on_gated mode). command_vocab = canonical commands the voice layer fuzzy-matches transcriptions against (e.g. 'Turn right up' → 'turn right' so command_parser's regex catches it). Edit these to add new vocabulary — NO code change required.",
|
||||
"wake_words": [
|
||||
@ -85,21 +88,21 @@
|
||||
|
||||
|
||||
"_wake_comment": "Custom energy-based wake detector with adaptive noise floor. speech_threshold is a FLOOR — the effective trigger is max(speech_threshold, ambient_baseline * wake_adaptive_mult). CRITICAL: speech_threshold must be ABOVE your room's ambient RMS, else state stays in SPEAKING forever and baseline can't learn. Check logs — if `baseline=0` and `state=SPEAKING` persist with peak values well below your 'Sanad' peaks, raise this floor. Normal G1 room: 80-150. Noisy room (fans, HVAC): 300-500. Measured-here room ambient ≈ 250-350, so 400 gives margin.",
|
||||
"speech_threshold": 400.0,
|
||||
"speech_threshold": 200.0,
|
||||
"min_word_duration": 0.25,
|
||||
"max_word_duration": 2.50,
|
||||
"post_silence": 0.20,
|
||||
"wake_cooldown": 1.00,
|
||||
"wake_chunk_ms": 50,
|
||||
"wake_adaptive_window_n": 50,
|
||||
"wake_adaptive_mult": 3.0,
|
||||
"wake_adaptive_mult": 2.0,
|
||||
"wake_diag_log_sec": 3.0,
|
||||
|
||||
"wake_ack": "tts",
|
||||
"_wake_ack_comment": "tts = spoken 'Yes' via TtsMaker (~1.7s G1 firmware latency). none = silent, relies on terminal print (fast).",
|
||||
|
||||
"_wake_verify_comment": "DISABLED for speed. When enabled, runs a ~3s Whisper decode on each wake burst and rejects non-/sa-/ speech — good for filtering coughs/claps, but adds 3s latency per wake. With it off, every acoustic wake fires — faster response, more false wakes from loud noises (but those drop silently at the 'no speech' recording stage, so user impact is near-zero). Set true again if background noise is triggering too many false wakes.",
|
||||
"wake_verify_enabled": false,
|
||||
"_wake_verify_comment": "ENABLED — confirm each acoustic wake with a lightweight Whisper decode on the triggering burst BEFORE speaking 'Yes' / opening the record window. Accept if the transcription contains any wake-word variant OR starts with s/sh/z (Whisper's consistent signature for mishearing 'Sanad' as 'Stop', 'Sand', 'Set', etc.). Reject pure noise (empty whisper) and clearly non-/sa-/ speech silently. Cost: ~1-2s extra per wake for the Whisper decode. Trade-off: slower wake response, but no 'Yes' played on stray loud noises. Set false if you prefer speed over false-wake filtering.",
|
||||
"wake_verify_enabled": true,
|
||||
|
||||
|
||||
"_vad_comment": "Hysteretic VAD with adaptive ambient baseline. speech_entry_rms = 'user started talking' floor; silence_exit_rms = 'user stopped' floor (must be < entry). Adaptive: we reuse the wake detector's baseline (measured during idle silence) to bump both up if the room is noisier than the floors. Tune for YOUR ambient: check `command audio: peak=X rms=Y` in voice.log — speech_entry should sit roughly 2× above ambient rms. Room-measured ambient ≈ 250-350 → entry 400 with margin.",
|
||||
|
||||
@ -1 +0,0 @@
|
||||
[]
|
||||
@ -1,8 +0,0 @@
|
||||
[
|
||||
{
|
||||
"time": "15:29:59",
|
||||
"cmd": "hello, can you hear me",
|
||||
"response": "Hello, can you hear me?",
|
||||
"duration_s": 4.69
|
||||
}
|
||||
]
|
||||
@ -1 +0,0 @@
|
||||
[]
|
||||
@ -1 +0,0 @@
|
||||
{}
|
||||
@ -1 +0,0 @@
|
||||
[]
|
||||
@ -1,8 +0,0 @@
|
||||
[
|
||||
{
|
||||
"time": "15:37:37",
|
||||
"cmd": "turn left",
|
||||
"response": "local command",
|
||||
"duration_s": 0.0
|
||||
}
|
||||
]
|
||||
@ -1 +0,0 @@
|
||||
[]
|
||||
@ -1 +0,0 @@
|
||||
{}
|
||||
@ -1 +0,0 @@
|
||||
[]
|
||||
@ -1,8 +0,0 @@
|
||||
[
|
||||
{
|
||||
"time": "16:22:54",
|
||||
"cmd": "hi",
|
||||
"response": "Hello! I am Sanad. How can I help you?",
|
||||
"duration_s": 0.0
|
||||
}
|
||||
]
|
||||
@ -1 +0,0 @@
|
||||
[]
|
||||
@ -1 +0,0 @@
|
||||
{}
|
||||
@ -1 +0,0 @@
|
||||
[]
|
||||
@ -1,8 +0,0 @@
|
||||
[
|
||||
{
|
||||
"time": "16:24:12",
|
||||
"cmd": "what do you see",
|
||||
"response": "I see a man sitting in front of a desk with a backpack and a helmet on it. There is also a red and white object hanging on the wall behind him.",
|
||||
"duration_s": 0.0
|
||||
}
|
||||
]
|
||||
@ -1 +0,0 @@
|
||||
[]
|
||||
@ -1 +0,0 @@
|
||||
{}
|
||||
BIN
Data/Voice/Recordings/unk_1776999824.wav
Normal file
BIN
Data/Voice/Recordings/unk_1776999824.wav
Normal file
Binary file not shown.
BIN
Data/Voice/Recordings/unk_1777000093.wav
Normal file
BIN
Data/Voice/Recordings/unk_1777000093.wav
Normal file
Binary file not shown.
@ -60,6 +60,30 @@ logging.basicConfig(
|
||||
log = logging.getLogger("marcus_voice")
|
||||
|
||||
|
||||
# ── Transcript log ─────────────────────────────────────────────
|
||||
# Every transcribed utterance (wake or not, command or not) is
|
||||
# written here in a simple one-line-per-entry format so the operator
|
||||
# can scan everything the mic heard without wading through the full
|
||||
# voice.log. The file rotates once it reaches 5 MB, keeping 3 backups.
|
||||
# Dedicated "transcript" logger: one line per transcribed utterance, kept
# separate from the main voice log.
_TRANSCRIPT_PATH = os.path.join(LOG_DIR, "transcript.log")
_transcript_log = logging.getLogger("transcript")
_transcript_log.setLevel(logging.INFO)
# Do not forward records to ancestor loggers, otherwise each transcript
# line would be emitted a second time by their handlers.
_transcript_log.propagate = False
# Attach the file handler only when the logger has none yet, so the same
# handler is never added twice.
if not _transcript_log.handlers:
    _th = RotatingFileHandler(
        _TRANSCRIPT_PATH,
        maxBytes=5_000_000,
        backupCount=3,
        encoding="utf-8",
    )
    _th.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
    _transcript_log.addHandler(_th)
|
||||
|
||||
|
||||
def _log_transcript(action: str, text: str) -> None:
    """Emit a single entry on the dedicated transcript logger.

    action: short tag such as 'HEARD' / 'WAKE' / 'CMD' / 'UNK'; it is
            left-justified and padded to five characters in the output.
    text:   the utterance to record. A falsy value (None or '') is
            written as an empty string; surrounding whitespace is
            stripped.
    """
    cleaned = text.strip() if text else ""
    _transcript_log.info("%-5s %s", action, cleaned)
|
||||
|
||||
|
||||
# Module-level vocabulary containers. EMPTY on import — populated by
|
||||
# VoiceModule.__init__ from Config/config_Voice.json::stt.{wake_words,
|
||||
# command_vocab, garbage_patterns}. Config is the single source of truth;
|
||||
@ -248,13 +272,35 @@ class VoiceModule:
|
||||
if self._mic_gain != 1.0:
|
||||
log.info("mic_gain = %.2fx (applied to all mic reads)", self._mic_gain)
|
||||
|
||||
# ── faster-whisper (lazy-init on first wake) ──
|
||||
# ── STT backend selection ──
|
||||
# "faster_whisper" (default): Whisper base.en int8 on CPU via CTranslate2.
|
||||
# "moonshine": useful-sensors Moonshine via moonshine-voice
|
||||
# + onnxruntime. Different training, different
|
||||
# error profile from Whisper — useful when
|
||||
# Whisper's hallucinations (short "Yes.",
|
||||
# "Bye.", "It.") are the failure mode.
|
||||
self._backend_name = str(self._stt.get("backend", "faster_whisper")).lower()
|
||||
|
||||
# Lazy-init handles — concrete model loads on first wake so startup
|
||||
# stays light. `False` marks a failed init so we don't keep retrying.
|
||||
self._fw = None
|
||||
self._moonshine = None
|
||||
|
||||
# ── Two-turn wake state (always_on_gated mode) ──
|
||||
# self._awaiting_command: False = listening for wake.
|
||||
# True = wake heard, next utterance is
|
||||
# the command. Cleared after the
|
||||
# command dispatches or after
|
||||
# await_command_timeout_sec seconds
|
||||
# so a stray "Sanad" doesn't arm
|
||||
# forever.
|
||||
self._awaiting_command = False
|
||||
self._await_deadline = 0.0
|
||||
|
||||
self._running = False
|
||||
self._thread = None
|
||||
self._cooldown_until = 0.0
|
||||
log.info("VoiceModule initialized (wake=custom, stt=faster-whisper)")
|
||||
log.info("VoiceModule initialized (wake=custom, stt=%s)", self._backend_name)
|
||||
|
||||
# ─── gain-applied mic read ────────────────────────────
|
||||
|
||||
@ -299,6 +345,49 @@ class VoiceModule:
|
||||
self._fw = None
|
||||
return self._fw
|
||||
|
||||
# ─── lazy moonshine init ──────────────────────────────
|
||||
|
||||
def _get_moonshine(self):
    """
    Return the Moonshine transcriber, loading it on the first call.

    `self._moonshine` acts as a three-state cache:
      None     -> not attempted yet; try to load now.
      False    -> a previous load failed; return None without retrying.
      (object) -> the loaded transcriber; return it as-is.

    The language is read from the `moonshine_language` STT setting
    (default "en"). Any exception raised while importing the package,
    downloading the model or constructing the transcriber is logged,
    the cache is marked as failed, and None is returned so the caller
    can fall back.
    """
    cached = self._moonshine
    if cached is False:
        # Earlier attempt failed — do not retry.
        return None
    if cached is not None:
        return cached

    language = self._stt.get("moonshine_language", "en")
    log.info("Loading Moonshine: language=%s", language)
    try:
        # Imported here so the dependency is only needed when the
        # Moonshine backend is actually used.
        from moonshine_voice import Transcriber
        from moonshine_voice.download import download_model

        path, arch = download_model(language=language)
        self._moonshine = Transcriber(model_path=path, model_arch=arch)
        log.info("Moonshine ready: arch=%s", arch)
    except Exception as exc:
        log.error("Moonshine init failed: %s — voice will be wake-only", exc)
        self._moonshine = False
        return None
    return self._moonshine
|
||||
|
||||
def _moonshine_decode(self, audio_f32: np.ndarray) -> str:
    """
    Run one non-streaming Moonshine transcription on `audio_f32`.

    The audio is passed to the transcriber together with
    `self._sample_rate`. The `text` attribute of every entry in the
    result's `lines` is joined with single spaces and stripped.

    Returns '' when the transcriber is unavailable or when the
    transcription raises (the error is logged, not propagated).
    """
    transcriber = self._get_moonshine()
    if transcriber is None:
        return ""
    try:
        outcome = transcriber.transcribe_without_streaming(
            audio_data=audio_f32, sample_rate=self._sample_rate,
        )
        # A missing or empty `lines` attribute is treated as no output.
        segments = getattr(outcome, "lines", None) or []
        pieces = [getattr(segment, "text", "") for segment in segments]
        text = " ".join(pieces).strip()
        log.info("moonshine: text=%r", text[:80])
        return text
    except Exception as exc:
        log.error("moonshine transcribe failed: %s", exc)
        return ""
|
||||
|
||||
# ─── command recording ────────────────────────────────
|
||||
|
||||
def _record_command(self) -> np.ndarray:
|
||||
@ -445,7 +534,10 @@ class VoiceModule:
|
||||
# ─── transcription ────────────────────────────────────
|
||||
|
||||
def _transcribe(self, audio_i16: np.ndarray) -> str:
|
||||
"""int16 PCM → Whisper transcription. Returns '' on no-speech/noise."""
|
||||
"""int16 PCM → STT transcription. Returns '' on no-speech/noise."""
|
||||
if self._backend_name == "moonshine":
|
||||
return self._transcribe_moonshine(audio_i16, lenient=False)
|
||||
|
||||
fw = self._get_fw()
|
||||
if fw is None:
|
||||
return ""
|
||||
@ -596,6 +688,9 @@ class VoiceModule:
|
||||
The downside (no Sanad nudge) is fine here because the acoustic
|
||||
detector has already gated out non-speech.
|
||||
"""
|
||||
if self._backend_name == "moonshine":
|
||||
return self._transcribe_moonshine(audio_i16, lenient=True)
|
||||
|
||||
fw = self._get_fw()
|
||||
if fw is None:
|
||||
return ""
|
||||
@ -634,6 +729,39 @@ class VoiceModule:
|
||||
log.error("whisper-raw transcribe failed: %s", e)
|
||||
return ""
|
||||
|
||||
def _transcribe_moonshine(self, audio_i16: np.ndarray, lenient: bool) -> str:
    """
    Transcribe int16 PCM through the Moonshine backend.

    Pre-processing is deliberately light: scale to float32 in [-1, 1),
    subtract the mean (DC removal), and — when the peak lies strictly
    between 1e-4 and 0.7 — scale the signal so its peak becomes 0.7.

    lenient=True  returns whatever Moonshine produced, skipping the
                  garbage-pattern and minimum-length filters.
    lenient=False rejects (returns '') a transcription whose
                  lower-cased, punctuation-trimmed form is listed in
                  GARBAGE_PATTERNS or is shorter than
                  _MIN_TRANSCRIPTION_LENGTH, unless that form is an
                  entry of WAKE_WORDS or equals a lower-cased entry of
                  COMMAND_VOCAB.

    Returns '' for empty input or an empty decode.
    """
    if audio_i16.size == 0:
        return ""

    samples = audio_i16.astype(np.float32) / 32768.0
    samples = samples - np.mean(samples)
    peak = float(np.abs(samples).max())
    # Leave near-silent buffers and already-loud buffers untouched.
    if 1e-4 < peak < 0.7:
        samples = samples * (0.7 / peak)

    text = self._moonshine_decode(samples)
    if not text:
        return ""
    if lenient:
        return text

    normalized = text.lower().strip().rstrip(".!?,")
    known_commands = {entry.lower() for entry in COMMAND_VOCAB}
    looks_like_noise = (
        normalized in GARBAGE_PATTERNS
        or len(normalized) < _MIN_TRANSCRIPTION_LENGTH
    )
    if looks_like_noise:
        # Short or garbage-looking text is still accepted when it is
        # exactly a wake word or a canonical command.
        is_known = normalized in WAKE_WORDS or normalized in known_commands
        if not is_known:
            log.info("Rejecting likely noise transcription: %r", text)
            return ""
    return text
|
||||
|
||||
# ─── command transcription ────────────────────────────
|
||||
|
||||
def _transcribe_command(self, audio_i16: np.ndarray) -> str:
|
||||
@ -925,39 +1053,80 @@ class VoiceModule:
|
||||
text = self._transcribe_command(audio) if audio.size else ""
|
||||
if text:
|
||||
log.info("HEARD: %r", text)
|
||||
# Gated mode: only dispatch if the wake word was
|
||||
# spoken. Everything is still logged above so the
|
||||
# operator has full visibility into what the mic
|
||||
# is picking up.
|
||||
_log_transcript("HEARD", text)
|
||||
|
||||
# ── Two-turn gated flow ────────────────────
|
||||
# State A — listening for wake:
|
||||
# non-wake utterance → log only, do not dispatch
|
||||
# "Sanad <cmd>" → strip + dispatch now
|
||||
# "Sanad" alone → speak "Yes", switch to
|
||||
# state B ("awaiting command")
|
||||
# State B — awaiting command (after Yes):
|
||||
# any utterance → dispatch as the command,
|
||||
# regardless of wake word.
|
||||
# Then back to state A.
|
||||
#
|
||||
# This matches the SanadVoice/gemini_interact
|
||||
# pattern: always transcribe + log every word,
|
||||
# say "yes" on wake, treat the next utterance
|
||||
# as the command.
|
||||
# Timeout stale await-command state
|
||||
if self._awaiting_command and time.time() > self._await_deadline:
|
||||
log.info(" [awaiting-cmd] timed out — back to wake-listen")
|
||||
self._awaiting_command = False
|
||||
|
||||
if self._awaiting_command:
|
||||
# State B — next utterance is the command.
|
||||
command = _strip_wake_word(text) # drop accidental "Sanad,"
|
||||
if not command:
|
||||
command = text # safety: never drop to empty
|
||||
command = self._normalize_command(command)
|
||||
log.info(" [awaiting-cmd] dispatching: %r", command)
|
||||
_log_transcript("CMD", command)
|
||||
self._awaiting_command = False
|
||||
print(f' [Sanad] heard: "{command}"')
|
||||
if self._on_command:
|
||||
try:
|
||||
self._on_command(command, "en")
|
||||
except Exception as e:
|
||||
log.error("on_command: %s", e, exc_info=True)
|
||||
continue
|
||||
|
||||
# State A — listening for wake.
|
||||
if gated and not _has_wake_word(text):
|
||||
log.info(" (no wake word — not dispatched)")
|
||||
else:
|
||||
log.info(" (no wake word — logged only)")
|
||||
_log_transcript("IGN", text)
|
||||
continue
|
||||
|
||||
if gated:
|
||||
command = _strip_wake_word(text)
|
||||
if command != text:
|
||||
log.info(" wake-stripped: %r → %r",
|
||||
text, command)
|
||||
# Bare wake word ("Sanad.", "Sanad") →
|
||||
# speak a "Yes" ack, do NOT call the
|
||||
# brain (it would hallucinate a random
|
||||
# response from a 1-word prompt).
|
||||
if not command:
|
||||
log.info(" wake-only utterance — speaking ack")
|
||||
# Bare "Sanad" — speak "Yes" and arm
|
||||
# the next-utterance-as-command trigger.
|
||||
log.info(" wake heard alone — speaking 'Yes', "
|
||||
"next utterance will be treated as command")
|
||||
_log_transcript("WAKE", text)
|
||||
try:
|
||||
self._audio.speak(
|
||||
self._messages.get("wake_heard", "Yes")
|
||||
)
|
||||
except Exception as e:
|
||||
log.warning("wake-ack TTS failed: %s", e)
|
||||
self._awaiting_command = True
|
||||
self._await_deadline = time.time() + float(
|
||||
self._stt.get("await_command_timeout_sec", 10.0)
|
||||
)
|
||||
continue
|
||||
else:
|
||||
command = text
|
||||
|
||||
# Normalize near-misses ("Turn right up" →
|
||||
# "turn right") so command_parser's regex
|
||||
# fast-path can hit without an LLM round-trip.
|
||||
# Sanad + command in one utterance (e.g.
|
||||
# "Sanad, turn left") → fuzzy-normalize + dispatch.
|
||||
command = self._normalize_command(command)
|
||||
|
||||
_log_transcript("CMD", command)
|
||||
print(f' [Sanad] heard: "{command}"')
|
||||
if self._on_command:
|
||||
try:
|
||||
@ -966,6 +1135,7 @@ class VoiceModule:
|
||||
log.error("on_command: %s", e, exc_info=True)
|
||||
else:
|
||||
log.info("utterance rejected (empty/garbage after Whisper)")
|
||||
_log_transcript("UNK", "(empty)")
|
||||
else:
|
||||
idle_peak_rms = max(idle_peak_rms, rms)
|
||||
idle_sum_rms += rms
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user