Update 2026-04-22 17:54:49
This commit is contained in:
parent
00e52496a9
commit
ce09b6920a
@ -5,17 +5,20 @@
|
|||||||
"target_sample_rate": 16000
|
"target_sample_rate": 16000
|
||||||
},
|
},
|
||||||
"stt": {
|
"stt": {
|
||||||
"wake_model": "tiny",
|
"backend": "vosk",
|
||||||
"command_model": "tiny",
|
"vosk_model_path": "Models/vosk-model-small-en-us-0.15",
|
||||||
"wake_words_en": [
|
"wake_words_en": [
|
||||||
"sanad", "sannad", "sanat", "sunnat",
|
"sanad", "sannad", "sanat", "sunnat",
|
||||||
"senad", "sennad", "sanid", "sanud", "sand",
|
"senad", "sennad", "sanid", "sanud",
|
||||||
"samad", "sandy", "sanday", "sunday", "synod", "signed", "sent"
|
"samad", "sandy", "sanday", "sunday", "synod", "signed",
|
||||||
|
"sand", "send", "sent", "set", "seen", "seed",
|
||||||
|
"then", "than", "that", "step", "stuck",
|
||||||
|
"said", "sad", "saw", "so", "sir", "sun"
|
||||||
],
|
],
|
||||||
"language": "en",
|
"language": "en",
|
||||||
"command_timeout_sec": 10,
|
"command_timeout_sec": 10,
|
||||||
"silence_threshold": 500,
|
"silence_threshold": 150,
|
||||||
"silence_duration_sec": 1.5,
|
"silence_duration_sec": 2.0,
|
||||||
"max_record_sec": 15
|
"max_record_sec": 15
|
||||||
},
|
},
|
||||||
"mic": {
|
"mic": {
|
||||||
|
|||||||
1
Data/Brain/Sessions/session_003_2026-04-22/alerts.json
Normal file
1
Data/Brain/Sessions/session_003_2026-04-22/alerts.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
8
Data/Brain/Sessions/session_003_2026-04-22/commands.json
Normal file
8
Data/Brain/Sessions/session_003_2026-04-22/commands.json
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"time": "16:22:54",
|
||||||
|
"cmd": "hi",
|
||||||
|
"response": "Hello! I am Sanad. How can I help you?",
|
||||||
|
"duration_s": 0.0
|
||||||
|
}
|
||||||
|
]
|
||||||
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
1
Data/Brain/Sessions/session_003_2026-04-22/places.json
Normal file
1
Data/Brain/Sessions/session_003_2026-04-22/places.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{}
|
||||||
1
Data/Brain/Sessions/session_004_2026-04-22/alerts.json
Normal file
1
Data/Brain/Sessions/session_004_2026-04-22/alerts.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
8
Data/Brain/Sessions/session_004_2026-04-22/commands.json
Normal file
8
Data/Brain/Sessions/session_004_2026-04-22/commands.json
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"time": "16:24:12",
|
||||||
|
"cmd": "what do you see",
|
||||||
|
"response": "I see a man sitting in front of a desk with a backpack and a helmet on it. There is also a red and white object hanging on the wall behind him.",
|
||||||
|
"duration_s": 0.0
|
||||||
|
}
|
||||||
|
]
|
||||||
@ -0,0 +1 @@
|
|||||||
|
[]
|
||||||
1
Data/Brain/Sessions/session_004_2026-04-22/places.json
Normal file
1
Data/Brain/Sessions/session_004_2026-04-22/places.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{}
|
||||||
BIN
Models/vosk-model-small-en-us-0.15.zip
Normal file
BIN
Models/vosk-model-small-en-us-0.15.zip
Normal file
Binary file not shown.
9
Models/vosk-model-small-en-us-0.15/README
Normal file
9
Models/vosk-model-small-en-us-0.15/README
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
US English model for mobile Vosk applications
|
||||||
|
|
||||||
|
Copyright 2020 Alpha Cephei Inc
|
||||||
|
|
||||||
|
Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean)
|
||||||
|
Speed: 0.11xRT (desktop)
|
||||||
|
Latency: 0.15s (right context)
|
||||||
|
|
||||||
|
|
||||||
BIN
Models/vosk-model-small-en-us-0.15/am/final.mdl
Normal file
BIN
Models/vosk-model-small-en-us-0.15/am/final.mdl
Normal file
Binary file not shown.
7
Models/vosk-model-small-en-us-0.15/conf/mfcc.conf
Normal file
7
Models/vosk-model-small-en-us-0.15/conf/mfcc.conf
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
--sample-frequency=16000
|
||||||
|
--use-energy=false
|
||||||
|
--num-mel-bins=40
|
||||||
|
--num-ceps=40
|
||||||
|
--low-freq=20
|
||||||
|
--high-freq=7600
|
||||||
|
--allow-downsample=true
|
||||||
10
Models/vosk-model-small-en-us-0.15/conf/model.conf
Normal file
10
Models/vosk-model-small-en-us-0.15/conf/model.conf
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
--min-active=200
|
||||||
|
--max-active=3000
|
||||||
|
--beam=10.0
|
||||||
|
--lattice-beam=2.0
|
||||||
|
--acoustic-scale=1.0
|
||||||
|
--frame-subsampling-factor=3
|
||||||
|
--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10
|
||||||
|
--endpoint.rule2.min-trailing-silence=0.5
|
||||||
|
--endpoint.rule3.min-trailing-silence=0.75
|
||||||
|
--endpoint.rule4.min-trailing-silence=1.0
|
||||||
BIN
Models/vosk-model-small-en-us-0.15/graph/Gr.fst
Normal file
BIN
Models/vosk-model-small-en-us-0.15/graph/Gr.fst
Normal file
Binary file not shown.
BIN
Models/vosk-model-small-en-us-0.15/graph/HCLr.fst
Normal file
BIN
Models/vosk-model-small-en-us-0.15/graph/HCLr.fst
Normal file
Binary file not shown.
17
Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int
Normal file
17
Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
10015
|
||||||
|
10016
|
||||||
|
10017
|
||||||
|
10018
|
||||||
|
10019
|
||||||
|
10020
|
||||||
|
10021
|
||||||
|
10022
|
||||||
|
10023
|
||||||
|
10024
|
||||||
|
10025
|
||||||
|
10026
|
||||||
|
10027
|
||||||
|
10028
|
||||||
|
10029
|
||||||
|
10030
|
||||||
|
10031
|
||||||
@ -0,0 +1,166 @@
|
|||||||
|
1 nonword
|
||||||
|
2 begin
|
||||||
|
3 end
|
||||||
|
4 internal
|
||||||
|
5 singleton
|
||||||
|
6 nonword
|
||||||
|
7 begin
|
||||||
|
8 end
|
||||||
|
9 internal
|
||||||
|
10 singleton
|
||||||
|
11 begin
|
||||||
|
12 end
|
||||||
|
13 internal
|
||||||
|
14 singleton
|
||||||
|
15 begin
|
||||||
|
16 end
|
||||||
|
17 internal
|
||||||
|
18 singleton
|
||||||
|
19 begin
|
||||||
|
20 end
|
||||||
|
21 internal
|
||||||
|
22 singleton
|
||||||
|
23 begin
|
||||||
|
24 end
|
||||||
|
25 internal
|
||||||
|
26 singleton
|
||||||
|
27 begin
|
||||||
|
28 end
|
||||||
|
29 internal
|
||||||
|
30 singleton
|
||||||
|
31 begin
|
||||||
|
32 end
|
||||||
|
33 internal
|
||||||
|
34 singleton
|
||||||
|
35 begin
|
||||||
|
36 end
|
||||||
|
37 internal
|
||||||
|
38 singleton
|
||||||
|
39 begin
|
||||||
|
40 end
|
||||||
|
41 internal
|
||||||
|
42 singleton
|
||||||
|
43 begin
|
||||||
|
44 end
|
||||||
|
45 internal
|
||||||
|
46 singleton
|
||||||
|
47 begin
|
||||||
|
48 end
|
||||||
|
49 internal
|
||||||
|
50 singleton
|
||||||
|
51 begin
|
||||||
|
52 end
|
||||||
|
53 internal
|
||||||
|
54 singleton
|
||||||
|
55 begin
|
||||||
|
56 end
|
||||||
|
57 internal
|
||||||
|
58 singleton
|
||||||
|
59 begin
|
||||||
|
60 end
|
||||||
|
61 internal
|
||||||
|
62 singleton
|
||||||
|
63 begin
|
||||||
|
64 end
|
||||||
|
65 internal
|
||||||
|
66 singleton
|
||||||
|
67 begin
|
||||||
|
68 end
|
||||||
|
69 internal
|
||||||
|
70 singleton
|
||||||
|
71 begin
|
||||||
|
72 end
|
||||||
|
73 internal
|
||||||
|
74 singleton
|
||||||
|
75 begin
|
||||||
|
76 end
|
||||||
|
77 internal
|
||||||
|
78 singleton
|
||||||
|
79 begin
|
||||||
|
80 end
|
||||||
|
81 internal
|
||||||
|
82 singleton
|
||||||
|
83 begin
|
||||||
|
84 end
|
||||||
|
85 internal
|
||||||
|
86 singleton
|
||||||
|
87 begin
|
||||||
|
88 end
|
||||||
|
89 internal
|
||||||
|
90 singleton
|
||||||
|
91 begin
|
||||||
|
92 end
|
||||||
|
93 internal
|
||||||
|
94 singleton
|
||||||
|
95 begin
|
||||||
|
96 end
|
||||||
|
97 internal
|
||||||
|
98 singleton
|
||||||
|
99 begin
|
||||||
|
100 end
|
||||||
|
101 internal
|
||||||
|
102 singleton
|
||||||
|
103 begin
|
||||||
|
104 end
|
||||||
|
105 internal
|
||||||
|
106 singleton
|
||||||
|
107 begin
|
||||||
|
108 end
|
||||||
|
109 internal
|
||||||
|
110 singleton
|
||||||
|
111 begin
|
||||||
|
112 end
|
||||||
|
113 internal
|
||||||
|
114 singleton
|
||||||
|
115 begin
|
||||||
|
116 end
|
||||||
|
117 internal
|
||||||
|
118 singleton
|
||||||
|
119 begin
|
||||||
|
120 end
|
||||||
|
121 internal
|
||||||
|
122 singleton
|
||||||
|
123 begin
|
||||||
|
124 end
|
||||||
|
125 internal
|
||||||
|
126 singleton
|
||||||
|
127 begin
|
||||||
|
128 end
|
||||||
|
129 internal
|
||||||
|
130 singleton
|
||||||
|
131 begin
|
||||||
|
132 end
|
||||||
|
133 internal
|
||||||
|
134 singleton
|
||||||
|
135 begin
|
||||||
|
136 end
|
||||||
|
137 internal
|
||||||
|
138 singleton
|
||||||
|
139 begin
|
||||||
|
140 end
|
||||||
|
141 internal
|
||||||
|
142 singleton
|
||||||
|
143 begin
|
||||||
|
144 end
|
||||||
|
145 internal
|
||||||
|
146 singleton
|
||||||
|
147 begin
|
||||||
|
148 end
|
||||||
|
149 internal
|
||||||
|
150 singleton
|
||||||
|
151 begin
|
||||||
|
152 end
|
||||||
|
153 internal
|
||||||
|
154 singleton
|
||||||
|
155 begin
|
||||||
|
156 end
|
||||||
|
157 internal
|
||||||
|
158 singleton
|
||||||
|
159 begin
|
||||||
|
160 end
|
||||||
|
161 internal
|
||||||
|
162 singleton
|
||||||
|
163 begin
|
||||||
|
164 end
|
||||||
|
165 internal
|
||||||
|
166 singleton
|
||||||
BIN
Models/vosk-model-small-en-us-0.15/ivector/final.dubm
Normal file
BIN
Models/vosk-model-small-en-us-0.15/ivector/final.dubm
Normal file
Binary file not shown.
BIN
Models/vosk-model-small-en-us-0.15/ivector/final.ie
Normal file
BIN
Models/vosk-model-small-en-us-0.15/ivector/final.ie
Normal file
Binary file not shown.
BIN
Models/vosk-model-small-en-us-0.15/ivector/final.mat
Normal file
BIN
Models/vosk-model-small-en-us-0.15/ivector/final.mat
Normal file
Binary file not shown.
@ -0,0 +1,3 @@
|
|||||||
|
[
|
||||||
|
1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09
|
||||||
|
1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ]
|
||||||
@ -0,0 +1 @@
|
|||||||
|
# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh
|
||||||
2
Models/vosk-model-small-en-us-0.15/ivector/splice.conf
Normal file
2
Models/vosk-model-small-en-us-0.15/ivector/splice.conf
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
--left-context=3
|
||||||
|
--right-context=3
|
||||||
@ -27,6 +27,7 @@ import sys
|
|||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from logging.handlers import RotatingFileHandler
|
from logging.handlers import RotatingFileHandler
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@ -87,9 +88,10 @@ class VoiceModule:
|
|||||||
self._stt = self._config["stt"]
|
self._stt = self._config["stt"]
|
||||||
self._mic = self._config["mic"]
|
self._mic = self._config["mic"]
|
||||||
|
|
||||||
# Whisper models — lazy loaded on first _voice_loop() iteration
|
# STT (Vosk) — lazy loaded on first _voice_loop() iteration.
|
||||||
self._wake_model = None
|
# One Model instance, recognizers are created fresh per-utterance.
|
||||||
self._cmd_model = None
|
self._vosk_model = None
|
||||||
|
self._KaldiRecognizer = None
|
||||||
|
|
||||||
# Wake words (English only — built-in TTS doesn't do Arabic)
|
# Wake words (English only — built-in TTS doesn't do Arabic)
|
||||||
self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
|
self._wake_en = [w.lower() for w in self._stt.get("wake_words_en",
|
||||||
@ -115,29 +117,52 @@ class VoiceModule:
|
|||||||
|
|
||||||
# ─── MODEL LOADING ────────────────────────────────────
|
# ─── MODEL LOADING ────────────────────────────────────
|
||||||
|
|
||||||
def _load_whisper(self):
|
def _load_stt(self):
|
||||||
"""
|
"""
|
||||||
Lazy-load Whisper models on CPU.
|
Load Vosk ASR model. Replaces openai-whisper which produced garbage
|
||||||
|
(!!!!!!!) on this Jetson's torch-aarch64 install regardless of
|
||||||
|
audio quality. Vosk uses Kaldi's own CPU kernels — no torch, no
|
||||||
|
numerical instability, ~10× faster than Whisper base on CPU.
|
||||||
|
|
||||||
Force device='cpu' regardless of torch.cuda.is_available(). On the
|
Model path is configured via stt.vosk_model_path (relative to
|
||||||
Jetson the torch install sometimes claims CUDA but can't deserialize
|
PROJECT_ROOT, or absolute). Default: the small English model,
|
||||||
to it (aarch64 wheel mismatch), and Whisper's default device-auto
|
which is ~40 MB and plenty for short voice commands.
|
||||||
then crashes with:
|
|
||||||
_pickle.UnpicklingError: Weights only load failed.
|
|
||||||
Attempting to deserialize object on CUDA device 0
|
|
||||||
CPU-only inference is plenty fast for Whisper tiny (~80 MB model).
|
|
||||||
"""
|
"""
|
||||||
import whisper
|
from vosk import Model, KaldiRecognizer, SetLogLevel
|
||||||
|
SetLogLevel(-1) # silence Vosk's stderr spam
|
||||||
|
|
||||||
if self._wake_model is None:
|
if self._vosk_model is None:
|
||||||
log.info("Loading Whisper '%s' for wake word (CPU)...", self._stt["wake_model"])
|
rel = self._stt.get("vosk_model_path", "Models/vosk-model-small-en-us-0.15")
|
||||||
self._wake_model = whisper.load_model(self._stt["wake_model"], device="cpu")
|
model_path = rel if os.path.isabs(rel) else os.path.join(PROJECT_ROOT, rel)
|
||||||
log.info("Wake model ready")
|
if not os.path.isdir(model_path):
|
||||||
|
raise RuntimeError(
|
||||||
|
"[Voice] Vosk model not found at " + model_path + "\n"
|
||||||
|
" Download it on the Jetson:\n"
|
||||||
|
" cd ~/Marcus/Models\n"
|
||||||
|
" wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip\n"
|
||||||
|
" unzip vosk-model-small-en-us-0.15.zip"
|
||||||
|
)
|
||||||
|
log.info("Loading Vosk model: %s", model_path)
|
||||||
|
self._vosk_model = Model(model_path)
|
||||||
|
self._KaldiRecognizer = KaldiRecognizer
|
||||||
|
log.info("Vosk model ready")
|
||||||
|
|
||||||
if self._cmd_model is None:
|
# NO restricted grammar. Vosk's small English model's lexicon
|
||||||
log.info("Loading Whisper '%s' for commands (CPU)...", self._stt["command_model"])
|
# doesn't contain "sanad" (it's not an English word), so passing
|
||||||
self._cmd_model = whisper.load_model(self._stt["command_model"], device="cpu")
|
# it in a restricted grammar makes Vosk drop the word with:
|
||||||
log.info("Command model ready")
|
# WARNING (VoskAPI:UpdateGrammarFst) Ignoring word missing in
|
||||||
|
# vocabulary: 'sanad'
|
||||||
|
# and the decoder then only has "[unk]" → never matches
|
||||||
|
# anything → Transcribed always empty.
|
||||||
|
#
|
||||||
|
# Instead: open vocabulary transcription, fuzzy-match against
|
||||||
|
# the stt.wake_words_en list which contains the English words
|
||||||
|
# Vosk ACTUALLY hears when you say "sanad" (then, send, sand,
|
||||||
|
# step, signed, etc.).
|
||||||
|
self._wake_grammar = None
|
||||||
|
|
||||||
|
# Back-compat alias for any caller that still references the old name
|
||||||
|
_load_whisper = _load_stt
|
||||||
|
|
||||||
# ─── MIC RECORDING (G1 built-in UDP) ──────────────────
|
# ─── MIC RECORDING (G1 built-in UDP) ──────────────────
|
||||||
|
|
||||||
@ -189,72 +214,55 @@ class VoiceModule:
|
|||||||
|
|
||||||
# ─── TRANSCRIPTION ────────────────────────────────────
|
# ─── TRANSCRIPTION ────────────────────────────────────
|
||||||
|
|
||||||
def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str:
|
def _transcribe(self, audio: np.ndarray, grammar: Optional[str] = None) -> str:
|
||||||
"""Transcribe audio using Whisper. Returns text."""
|
"""
|
||||||
import warnings
|
Transcribe audio using Vosk.
|
||||||
import whisper
|
|
||||||
|
|
||||||
# Audio stats — log before transcribe so we can see exactly what
|
When `grammar` is a JSON list string (e.g. `'["sanad","[unk]"]'`),
|
||||||
# Whisper is being fed. Useful when wake-word never fires: if
|
Vosk is constrained to that vocabulary only. Caveat: words missing
|
||||||
# peak_int16 is always < 500 the mic is too quiet regardless of
|
from the model's lexicon (e.g. "sanad") are dropped by Vosk, so the
|
||||||
# any software gain.
|
wake-word and command passes both use grammar=None (open vocabulary).
|
||||||
|
"""
|
||||||
|
import json as _json
|
||||||
|
|
||||||
|
# Audio stats — still useful for "mic is silent" diagnostics.
|
||||||
peak_i16 = int(np.abs(audio).max()) if audio.size else 0
|
peak_i16 = int(np.abs(audio).max()) if audio.size else 0
|
||||||
rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
|
rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0
|
||||||
log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)
|
log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16)
|
||||||
|
|
||||||
# Convert int16 to float32 [-1, 1]
|
if audio.size == 0:
|
||||||
audio_f32 = audio.astype(np.float32) / 32768.0
|
|
||||||
|
|
||||||
# Normalize to ~0.9 peak so Whisper's mel features carry real energy.
|
|
||||||
# Harmless on already-loud audio. Skip if peak is essentially zero
|
|
||||||
# (no signal at all) — amplifying pure noise doesn't help.
|
|
||||||
peak = float(np.abs(audio_f32).max())
|
|
||||||
if peak > 1e-4 and peak < 0.9:
|
|
||||||
audio_f32 = audio_f32 * (0.9 / peak)
|
|
||||||
log.info("normalized ×%.1f (peak %.4f → 0.9)", 0.9 / peak, peak)
|
|
||||||
|
|
||||||
# Suppress the per-call "Performing inference on CPU when CUDA is
|
|
||||||
# available" UserWarning. A module-level warnings.filterwarnings()
|
|
||||||
# doesn't catch it because whisper re-issues the warning every call
|
|
||||||
# via its own logger path. catch_warnings scoped to this call is
|
|
||||||
# the clean way.
|
|
||||||
#
|
|
||||||
# CRITICAL: temperature=0.0 (greedy, no fallback).
|
|
||||||
# Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6,
|
|
||||||
# 0.8, 1.0) — it retries with higher temperatures when the greedy
|
|
||||||
# pass misses a quality gate. The retry path calls
|
|
||||||
# `Categorical(logits=logits / temperature).sample()` which blows
|
|
||||||
# up on Jetson's torch-aarch64 (logits overflow to inf → softmax
|
|
||||||
# becomes NaN). Traceback (2026-04-22):
|
|
||||||
# ValueError: Expected parameter logits ... found invalid values:
|
|
||||||
# tensor([[nan, nan, nan, ..., nan, nan, nan]])
|
|
||||||
# The voice thread crashed every 2 s and wake-word never fired.
|
|
||||||
# Forcing temperature=0.0 stays on the greedy path (argmax), which
|
|
||||||
# has no Categorical sampler and no numerical instability.
|
|
||||||
with warnings.catch_warnings():
|
|
||||||
warnings.simplefilter("ignore")
|
|
||||||
result = model.transcribe(
|
|
||||||
audio_f32,
|
|
||||||
language=self._stt["language"], # None = auto-detect
|
|
||||||
task=task,
|
|
||||||
fp16=False,
|
|
||||||
temperature=0.0, # no fallback — avoids NaN bug
|
|
||||||
condition_on_previous_text=False, # no accumulated context
|
|
||||||
)
|
|
||||||
text = result["text"].strip()
|
|
||||||
detected_lang = result.get("language", "unknown")
|
|
||||||
|
|
||||||
# Filter Whisper's "no phonetic content" degeneration patterns.
|
|
||||||
# Near-silence or very quiet speech can produce repetitive filler
|
|
||||||
# like "!!!!!!!!!", ". . . . .", "... ... ...", or a single
|
|
||||||
# repeated word. Treat anything with < 3 distinct alphanumeric
|
|
||||||
# characters as silence so the wake-word check doesn't see it.
|
|
||||||
alnum = ''.join(c.lower() for c in text if c.isalnum())
|
|
||||||
if not alnum or len(set(alnum)) < 3:
|
|
||||||
log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60])
|
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
log.info("Transcribed [%s]: %s", detected_lang, text[:100])
|
# Fresh recognizer per utterance. Pass grammar if provided.
|
||||||
|
if grammar:
|
||||||
|
rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate, grammar)
|
||||||
|
else:
|
||||||
|
rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate)
|
||||||
|
rec.SetWords(False)
|
||||||
|
|
||||||
|
# Single-shot: feed the whole utterance in one AcceptWaveform call,
|
||||||
|
# then take FinalResult. Chunk-based feeding split short "sanad"
|
||||||
|
# utterances across chunk boundaries and Vosk's decoder often
|
||||||
|
# refused to commit, returning empty. Single-shot works for every
|
||||||
|
# voice-assistant example in Vosk's docs.
|
||||||
|
#
|
||||||
|
# When FinalResult is empty, also check PartialResult — sometimes
|
||||||
|
# Vosk heard something but didn't reach a segmentation boundary
|
||||||
|
# yet. PartialResult still has the text, just not "finalized".
|
||||||
|
rec.AcceptWaveform(audio.tobytes())
|
||||||
|
final = _json.loads(rec.FinalResult()).get("text", "").strip()
|
||||||
|
if not final:
|
||||||
|
partial = _json.loads(rec.PartialResult()).get("partial", "").strip()
|
||||||
|
if partial:
|
||||||
|
final = partial
|
||||||
|
log.info(" (partial only, no final commit)")
|
||||||
|
text = final
|
||||||
|
|
||||||
|
if not text:
|
||||||
|
log.info("Transcribed: (empty)")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
log.info("Transcribed: %s", text[:100])
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def _check_wake_word(self, text: str) -> bool:
|
def _check_wake_word(self, text: str) -> bool:
|
||||||
@ -298,23 +306,29 @@ class VoiceModule:
|
|||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
def _do_idle(self):
|
def _do_idle(self):
|
||||||
"""Listen for wake word in 2-second chunks."""
|
"""Listen for wake word in 4-second chunks. Longer windows give
|
||||||
|
Vosk's decoder enough context to commit short utterances like a
|
||||||
|
single 'sanad'."""
|
||||||
# Skip if robot is speaking — prevents self-listening
|
# Skip if robot is speaking — prevents self-listening
|
||||||
if self._audio.is_speaking:
|
if self._audio.is_speaking:
|
||||||
time.sleep(0.2)
|
time.sleep(0.2)
|
||||||
return
|
return
|
||||||
|
|
||||||
audio = self._record_chunk(2.0)
|
audio = self._record_chunk(4.0)
|
||||||
|
|
||||||
# Double-check speaking didn't start during recording
|
# Double-check speaking didn't start during recording
|
||||||
if self._audio.is_speaking:
|
if self._audio.is_speaking:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Skip if too quiet (no one talking)
|
# Skip if too quiet (no one talking). Threshold lowered to 60 to
|
||||||
if audio.std() < 100:
|
# match the G1 on-board mic's typical noise floor (std ~30-80 when
|
||||||
|
# idle, ~150+ when someone speaks). With 100 we were skipping
|
||||||
|
# quiet "sanad" utterances entirely.
|
||||||
|
if audio.std() < 60:
|
||||||
return
|
return
|
||||||
|
|
||||||
text = self._transcribe(audio, self._wake_model)
|
# Wake-word pass: self._wake_grammar is None, so this is open-vocabulary transcription; _check_wake_word() then decides on the text
|
||||||
|
text = self._transcribe(audio, grammar=self._wake_grammar)
|
||||||
|
|
||||||
if self._check_wake_word(text):
|
if self._check_wake_word(text):
|
||||||
log.info("Wake word detected!")
|
log.info("Wake word detected!")
|
||||||
@ -330,10 +344,18 @@ class VoiceModule:
|
|||||||
|
|
||||||
def _do_wake_heard(self):
|
def _do_wake_heard(self):
|
||||||
"""Record the command until silence."""
|
"""Record the command until silence."""
|
||||||
# Wait for "Listening..." TTS to finish before recording
|
# Wait for "Yes" TTS to finish before recording.
|
||||||
while self._audio.is_speaking:
|
while self._audio.is_speaking:
|
||||||
time.sleep(0.1)
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
# CRITICAL: flush the mic ring buffer. The UDP multicast receiver
|
||||||
|
# has been accumulating audio continuously (including pre-wake
|
||||||
|
# silence and the TTS "Yes" that just played back into the mic
|
||||||
|
# path). Without flush, _record_until_silence() reads the old
|
||||||
|
# buffered silence instantly, counts 3 silent chunks, and exits
|
||||||
|
# before the user has started speaking the command.
|
||||||
|
self._mic_capture.flush()
|
||||||
|
|
||||||
log.info("Recording command...")
|
log.info("Recording command...")
|
||||||
audio = self._record_until_silence()
|
audio = self._record_until_silence()
|
||||||
|
|
||||||
@ -348,7 +370,7 @@ class VoiceModule:
|
|||||||
|
|
||||||
def _do_processing(self):
|
def _do_processing(self):
|
||||||
"""Transcribe the command and send to brain."""
|
"""Transcribe the command and send to brain."""
|
||||||
text = self._transcribe(self._command_audio, self._cmd_model)
|
text = self._transcribe(self._command_audio)
|
||||||
self._command_audio = None
|
self._command_audio = None
|
||||||
|
|
||||||
if not text or len(text.strip()) < 2:
|
if not text or len(text.strip()) < 2:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user