Update 2026-04-22 17:54:49

This commit is contained in:
kassam 2026-04-22 17:54:50 +04:00
parent 00e52496a9
commit ce09b6920a
25 changed files with 355 additions and 93 deletions

View File

@ -5,17 +5,20 @@
"target_sample_rate": 16000
},
"stt": {
"wake_model": "tiny",
"command_model": "tiny",
"backend": "vosk",
"vosk_model_path": "Models/vosk-model-small-en-us-0.15",
"wake_words_en": [
"sanad", "sannad", "sanat", "sunnat",
"senad", "sennad", "sanid", "sanud", "sand",
"samad", "sandy", "sanday", "sunday", "synod", "signed", "sent"
"senad", "sennad", "sanid", "sanud",
"samad", "sandy", "sanday", "sunday", "synod", "signed",
"sand", "send", "sent", "set", "seen", "seed",
"then", "than", "that", "step", "stuck",
"said", "sad", "saw", "so", "sir", "sun"
],
"language": "en",
"command_timeout_sec": 10,
"silence_threshold": 500,
"silence_duration_sec": 1.5,
"silence_threshold": 150,
"silence_duration_sec": 2.0,
"max_record_sec": 15
},
"mic": {

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1,8 @@
[
{
"time": "16:22:54",
"cmd": "hi",
"response": "Hello! I am Sanad. How can I help you?",
"duration_s": 0.0
}
]

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1 @@
{}

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1,8 @@
[
{
"time": "16:24:12",
"cmd": "what do you see",
"response": "I see a man sitting in front of a desk with a backpack and a helmet on it. There is also a red and white object hanging on the wall behind him.",
"duration_s": 0.0
}
]

View File

@ -0,0 +1 @@
[]

View File

@ -0,0 +1 @@
{}

Binary file not shown.

View File

@ -0,0 +1,9 @@
US English model for mobile Vosk applications
Copyright 2020 Alpha Cephei Inc
Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean)
Speed: 0.11xRT (desktop)
Latency: 0.15s (right context)

Binary file not shown.

View File

@ -0,0 +1,7 @@
--sample-frequency=16000
--use-energy=false
--num-mel-bins=40
--num-ceps=40
--low-freq=20
--high-freq=7600
--allow-downsample=true

View File

@ -0,0 +1,10 @@
--min-active=200
--max-active=3000
--beam=10.0
--lattice-beam=2.0
--acoustic-scale=1.0
--frame-subsampling-factor=3
--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
--endpoint.rule2.min-trailing-silence=0.5
--endpoint.rule3.min-trailing-silence=0.75
--endpoint.rule4.min-trailing-silence=1.0

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,17 @@
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031

View File

@ -0,0 +1,166 @@
1 nonword
2 begin
3 end
4 internal
5 singleton
6 nonword
7 begin
8 end
9 internal
10 singleton
11 begin
12 end
13 internal
14 singleton
15 begin
16 end
17 internal
18 singleton
19 begin
20 end
21 internal
22 singleton
23 begin
24 end
25 internal
26 singleton
27 begin
28 end
29 internal
30 singleton
31 begin
32 end
33 internal
34 singleton
35 begin
36 end
37 internal
38 singleton
39 begin
40 end
41 internal
42 singleton
43 begin
44 end
45 internal
46 singleton
47 begin
48 end
49 internal
50 singleton
51 begin
52 end
53 internal
54 singleton
55 begin
56 end
57 internal
58 singleton
59 begin
60 end
61 internal
62 singleton
63 begin
64 end
65 internal
66 singleton
67 begin
68 end
69 internal
70 singleton
71 begin
72 end
73 internal
74 singleton
75 begin
76 end
77 internal
78 singleton
79 begin
80 end
81 internal
82 singleton
83 begin
84 end
85 internal
86 singleton
87 begin
88 end
89 internal
90 singleton
91 begin
92 end
93 internal
94 singleton
95 begin
96 end
97 internal
98 singleton
99 begin
100 end
101 internal
102 singleton
103 begin
104 end
105 internal
106 singleton
107 begin
108 end
109 internal
110 singleton
111 begin
112 end
113 internal
114 singleton
115 begin
116 end
117 internal
118 singleton
119 begin
120 end
121 internal
122 singleton
123 begin
124 end
125 internal
126 singleton
127 begin
128 end
129 internal
130 singleton
131 begin
132 end
133 internal
134 singleton
135 begin
136 end
137 internal
138 singleton
139 begin
140 end
141 internal
142 singleton
143 begin
144 end
145 internal
146 singleton
147 begin
148 end
149 internal
150 singleton
151 begin
152 end
153 internal
154 singleton
155 begin
156 end
157 internal
158 singleton
159 begin
160 end
161 internal
162 singleton
163 begin
164 end
165 internal
166 singleton

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,3 @@
[
1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09
1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ]

View File

@ -0,0 +1 @@
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh

View File

@ -0,0 +1,2 @@
--left-context=3
--right-context=3

View File

@ -27,6 +27,7 @@ import sys
import threading
import time
from logging.handlers import RotatingFileHandler
from typing import Optional
import numpy as np
@ -87,9 +88,10 @@ class VoiceModule:
self._stt = self._config["stt"]
self._mic = self._config["mic"]
# Whisper models — lazy loaded on first _voice_loop() iteration
self._wake_model = None
self._cmd_model = None
# STT (Vosk) — lazy loaded on first _voice_loop() iteration.
# One Model instance, recognizers are created fresh per-utterance.
self._vosk_model = None
self._KaldiRecognizer = None
# Wake words (English only — built-in TTS doesn't do Arabic)
self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
@ -115,29 +117,52 @@ class VoiceModule:
# ─── MODEL LOADING ────────────────────────────────────
def _load_whisper(self):
def _load_stt(self):
"""
Lazy-load Whisper models on CPU.
Load Vosk ASR model. Replaces openai-whisper which produced garbage
(!!!!!!!) on this Jetson's torch-aarch64 install regardless of
audio quality. Vosk uses Kaldi's own CPU kernels — no torch, no
numerical instability, ~10× faster than Whisper base on CPU.
Force device='cpu' regardless of torch.cuda.is_available(). On the
Jetson the torch install sometimes claims CUDA but can't deserialize
to it (aarch64 wheel mismatch), and Whisper's default device-auto
then crashes with:
_pickle.UnpicklingError: Weights only load failed.
Attempting to deserialize object on CUDA device 0
CPU-only inference is plenty fast for Whisper tiny (~80 MB model).
Model path is configured via stt.vosk_model_path (relative to
PROJECT_ROOT, or absolute). Default: the small English model,
which is ~40 MB and plenty for short voice commands.
"""
import whisper
from vosk import Model, KaldiRecognizer, SetLogLevel
SetLogLevel(-1) # silence Vosk's stderr spam
if self._wake_model is None:
log.info("Loading Whisper '%s' for wake word (CPU)...", self._stt["wake_model"])
self._wake_model = whisper.load_model(self._stt["wake_model"], device="cpu")
log.info("Wake model ready")
if self._vosk_model is None:
rel = self._stt.get("vosk_model_path", "Models/vosk-model-small-en-us-0.15")
model_path = rel if os.path.isabs(rel) else os.path.join(PROJECT_ROOT, rel)
if not os.path.isdir(model_path):
raise RuntimeError(
"[Voice] Vosk model not found at " + model_path + "\n"
" Download it on the Jetson:\n"
" cd ~/Marcus/Models\n"
" wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip\n"
" unzip vosk-model-small-en-us-0.15.zip"
)
log.info("Loading Vosk model: %s", model_path)
self._vosk_model = Model(model_path)
self._KaldiRecognizer = KaldiRecognizer
log.info("Vosk model ready")
if self._cmd_model is None:
log.info("Loading Whisper '%s' for commands (CPU)...", self._stt["command_model"])
self._cmd_model = whisper.load_model(self._stt["command_model"], device="cpu")
log.info("Command model ready")
# NO restricted grammar. Vosk's small English model's lexicon
# doesn't contain "sanad" (it's not an English word), so passing
# it in a restricted grammar makes Vosk drop the word with:
# WARNING (VoskAPI:UpdateGrammarFst) Ignoring word missing in
# vocabulary: 'sanad'
# and the decoder then only has "[unk]" → never matches
# anything → Transcribed always empty.
#
# Instead: open vocabulary transcription, fuzzy-match against
# the stt.wake_words_en list which contains the English words
# Vosk ACTUALLY hears when you say "sanad" (then, send, sand,
# step, signed, etc.).
self._wake_grammar = None
# Back-compat alias for any caller that still references the old name
_load_whisper = _load_stt
# ─── MIC RECORDING (G1 built-in UDP) ──────────────────
@ -189,72 +214,55 @@ class VoiceModule:
# ─── TRANSCRIPTION ────────────────────────────────────
def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
"""Transcribe audio using Whisper. Returns text."""
import warnings
import whisper
def _transcribe(self, audio: np.ndarray, grammar: Optional[str] = None) -> str:
"""
Transcribe audio using Vosk.
# Audio stats — log before transcribe so we can see exactly what
# Whisper is being fed. Useful when wake-word never fires: if
# peak_int16 is always < 500 the mic is too quiet regardless of
# any software gain.
When `grammar` is a JSON list string (e.g. `'["sanad","[unk]"]'`),
Vosk is constrained to that vocabulary only — perfect for wake-word
detection where we KNOW the exact word we want to hear. Pass
grammar=None for open-vocabulary transcription (used for commands).
"""
import json as _json
# Audio stats — still useful for "mic is silent" diagnostics.
peak_i16 = int(np.abs(audio).max()) if audio.size else 0
rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)
# Convert int16 to float32 [-1, 1]
audio_f32 = audio.astype(np.float32) / 32768.0
# Normalize to ~0.9 peak so Whisper's mel features carry real energy.
# Harmless on already-loud audio. Skip if peak is essentially zero
# (no signal at all) — amplifying pure noise doesn't help.
peak = float(np.abs(audio_f32).max())
if peak > 1e-4 and peak < 0.9:
audio_f32 = audio_f32 * (0.9 / peak)
log.info("normalized ×%.1f (peak %.4f → 0.9)", 0.9 / peak, peak)
# Suppress the per-call "Performing inference on CPU when CUDA is
# available" UserWarning. A module-level warnings.filterwarnings()
# doesn't catch it because whisper re-issues the warning every call
# via its own logger path. catch_warnings scoped to this call is
# the clean way.
#
# CRITICAL: temperature=0.0 (greedy, no fallback).
# Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6,
# 0.8, 1.0) — it retries with higher temperatures when the greedy
# pass misses a quality gate. The retry path calls
# `Categorical(logits=logits / temperature).sample()` which blows
# up on Jetson's torch-aarch64 (logits overflow to inf → softmax
# becomes NaN). Traceback (2026-04-22):
# ValueError: Expected parameter logits ... found invalid values:
# tensor([[nan, nan, nan, ..., nan, nan, nan]])
# The voice thread crashed every 2 s and wake-word never fired.
# Forcing temperature=0.0 stays on the greedy path (argmax), which
# has no Categorical sampler and no numerical instability.
with warnings.catch_warnings():
warnings.simplefilter("ignore")
result = model.transcribe(
audio_f32,
language=self._stt["language"], # None = auto-detect
task=task,
fp16=False,
temperature=0.0, # no fallback — avoids NaN bug
condition_on_previous_text=False, # no accumulated context
)
text = result["text"].strip()
detected_lang = result.get("language", "unknown")
# Filter Whisper's "no phonetic content" degeneration patterns.
# Near-silence or very quiet speech can produce repetitive filler
# like "!!!!!!!!!", ". . . . .", "... ... ...", or a single
# repeated word. Treat anything with < 3 distinct alphanumeric
# characters as silence so the wake-word check doesn't see it.
alnum = ''.join(c.lower() for c in text if c.isalnum())
if not alnum or len(set(alnum)) < 3:
log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60])
if audio.size == 0:
return ""
log.info("Transcribed [%s]: %s", detected_lang, text[:100])
# Fresh recognizer per utterance. Pass grammar if provided.
if grammar:
rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate, grammar)
else:
rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate)
rec.SetWords(False)
# Single-shot: feed the whole utterance in one AcceptWaveform call,
# then take FinalResult. Chunk-based feeding split short "sanad"
# utterances across chunk boundaries and Vosk's decoder often
# refused to commit, returning empty. Single-shot works for every
# voice-assistant example in Vosk's docs.
#
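# When FinalResult is empty, also check PartialResult — sometimes
# Vosk heard something but didn't reach a segmentation boundary
# yet. The intent is that PartialResult still has the text, just
# not "finalized".
# NOTE(review): FinalResult() appears to flush and reset the
# recognizer, so a PartialResult() call made AFTER it may always come
# back empty, making this fallback a no-op. Confirm against the Vosk
# API; if so, read PartialResult() before calling FinalResult().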
rec.AcceptWaveform(audio.tobytes())
final = _json.loads(rec.FinalResult()).get("text", "").strip()
if not final:
partial = _json.loads(rec.PartialResult()).get("partial", "").strip()
if partial:
final = partial
log.info(" (partial only, no final commit)")
text = final
if not text:
log.info("Transcribed: (empty)")
return ""
log.info("Transcribed: %s", text[:100])
return text
def _check_wake_word(self, text: str) -> bool:
@ -298,23 +306,29 @@ class VoiceModule:
time.sleep(1)
def _do_idle(self):
"""Listen for wake word in 2-second chunks."""
"""Listen for wake word in 4-second chunks. Longer windows give
Vosk's decoder enough context to commit short utterances like a
single 'sanad'."""
# Skip if robot is speaking — prevents self-listening
if self._audio.is_speaking:
time.sleep(0.2)
return
audio = self._record_chunk(2.0)
audio = self._record_chunk(4.0)
# Double-check speaking didn't start during recording
if self._audio.is_speaking:
return
# Skip if too quiet (no one talking)
if audio.std() < 100:
# Skip if too quiet (no one talking). Threshold lowered to 60 to
# match the G1 on-board mic's typical noise floor (std ~30-80 when
# idle, ~150+ when someone speaks). With 100 we were skipping
# quiet "sanad" utterances entirely.
if audio.std() < 60:
return
text = self._transcribe(audio, self._wake_model)
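# Wake-word pass: self._wake_grammar is set to None in _load_stt (no
# restricted grammar, because "sanad" is not in the Vosk lexicon), so this
# is open-vocabulary transcription; _check_wake_word() then decides whether
# the transcribed text counts as a wake word.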
text = self._transcribe(audio, grammar=self._wake_grammar)
if self._check_wake_word(text):
log.info("Wake word detected!")
@ -330,10 +344,18 @@ class VoiceModule:
def _do_wake_heard(self):
"""Record the command until silence."""
# Wait for "Listening..." TTS to finish before recording
# Wait for "Yes" TTS to finish before recording.
while self._audio.is_speaking:
time.sleep(0.1)
# CRITICAL: flush the mic ring buffer. The UDP multicast receiver
# has been accumulating audio continuously (including pre-wake
# silence and the TTS "Yes" that just played back into the mic
# path). Without flush, _record_until_silence() reads the old
# buffered silence instantly, counts 3 silent chunks, and exits
# before the user has started speaking the command.
self._mic_capture.flush()
log.info("Recording command...")
audio = self._record_until_silence()
@ -348,7 +370,7 @@ class VoiceModule:
def _do_processing(self):
"""Transcribe the command and send to brain."""
text = self._transcribe(self._command_audio, self._cmd_model)
text = self._transcribe(self._command_audio)
self._command_audio = None
if not text or len(text.strip()) < 2: