diff --git a/Config/config_Voice.json b/Config/config_Voice.json index e25bf0c..beed019 100644 --- a/Config/config_Voice.json +++ b/Config/config_Voice.json @@ -5,17 +5,20 @@ "target_sample_rate": 16000 }, "stt": { - "wake_model": "tiny", - "command_model": "tiny", + "backend": "vosk", + "vosk_model_path": "Models/vosk-model-small-en-us-0.15", "wake_words_en": [ "sanad", "sannad", "sanat", "sunnat", - "senad", "sennad", "sanid", "sanud", "sand", - "samad", "sandy", "sanday", "sunday", "synod", "signed", "sent" + "senad", "sennad", "sanid", "sanud", + "samad", "sandy", "sanday", "sunday", "synod", "signed", + "sand", "send", "sent", "set", "seen", "seed", + "then", "than", "that", "step", "stuck", + "said", "sad", "saw", "so", "sir", "sun" ], "language": "en", "command_timeout_sec": 10, - "silence_threshold": 500, - "silence_duration_sec": 1.5, + "silence_threshold": 150, + "silence_duration_sec": 2.0, "max_record_sec": 15 }, "mic": { diff --git a/Data/Brain/Sessions/session_003_2026-04-22/alerts.json b/Data/Brain/Sessions/session_003_2026-04-22/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_003_2026-04-22/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_003_2026-04-22/commands.json b/Data/Brain/Sessions/session_003_2026-04-22/commands.json new file mode 100644 index 0000000..6f3c242 --- /dev/null +++ b/Data/Brain/Sessions/session_003_2026-04-22/commands.json @@ -0,0 +1,8 @@ +[ + { + "time": "16:22:54", + "cmd": "hi", + "response": "Hello! I am Sanad. 
How can I help you?", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_003_2026-04-22/detections.json b/Data/Brain/Sessions/session_003_2026-04-22/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_003_2026-04-22/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_003_2026-04-22/places.json b/Data/Brain/Sessions/session_003_2026-04-22/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_003_2026-04-22/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-22/alerts.json b/Data/Brain/Sessions/session_004_2026-04-22/alerts.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_004_2026-04-22/alerts.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-22/commands.json b/Data/Brain/Sessions/session_004_2026-04-22/commands.json new file mode 100644 index 0000000..67b8bc6 --- /dev/null +++ b/Data/Brain/Sessions/session_004_2026-04-22/commands.json @@ -0,0 +1,8 @@ +[ + { + "time": "16:24:12", + "cmd": "what do you see", + "response": "I see a man sitting in front of a desk with a backpack and a helmet on it. 
There is also a red and white object hanging on the wall behind him.", + "duration_s": 0.0 + } +] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-22/detections.json b/Data/Brain/Sessions/session_004_2026-04-22/detections.json new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/Data/Brain/Sessions/session_004_2026-04-22/detections.json @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/Data/Brain/Sessions/session_004_2026-04-22/places.json b/Data/Brain/Sessions/session_004_2026-04-22/places.json new file mode 100644 index 0000000..9e26dfe --- /dev/null +++ b/Data/Brain/Sessions/session_004_2026-04-22/places.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/Models/vosk-model-small-en-us-0.15.zip b/Models/vosk-model-small-en-us-0.15.zip new file mode 100644 index 0000000..0c94ec8 Binary files /dev/null and b/Models/vosk-model-small-en-us-0.15.zip differ diff --git a/Models/vosk-model-small-en-us-0.15/README b/Models/vosk-model-small-en-us-0.15/README new file mode 100644 index 0000000..a7f7931 --- /dev/null +++ b/Models/vosk-model-small-en-us-0.15/README @@ -0,0 +1,9 @@ +US English model for mobile Vosk applications + +Copyright 2020 Alpha Cephei Inc + +Accuracy: 10.38 (tedlium test) 9.85 (librispeech test-clean) +Speed: 0.11xRT (desktop) +Latency: 0.15s (right context) + + diff --git a/Models/vosk-model-small-en-us-0.15/am/final.mdl b/Models/vosk-model-small-en-us-0.15/am/final.mdl new file mode 100644 index 0000000..5596b31 Binary files /dev/null and b/Models/vosk-model-small-en-us-0.15/am/final.mdl differ diff --git a/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf b/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf new file mode 100644 index 0000000..eaa40c5 --- /dev/null +++ b/Models/vosk-model-small-en-us-0.15/conf/mfcc.conf @@ -0,0 +1,7 @@ +--sample-frequency=16000 +--use-energy=false +--num-mel-bins=40 +--num-ceps=40 +--low-freq=20 +--high-freq=7600 +--allow-downsample=true diff --git 
a/Models/vosk-model-small-en-us-0.15/conf/model.conf b/Models/vosk-model-small-en-us-0.15/conf/model.conf new file mode 100644 index 0000000..9d5b0da --- /dev/null +++ b/Models/vosk-model-small-en-us-0.15/conf/model.conf @@ -0,0 +1,10 @@ +--min-active=200 +--max-active=3000 +--beam=10.0 +--lattice-beam=2.0 +--acoustic-scale=1.0 +--frame-subsampling-factor=3 +--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10 +--endpoint.rule2.min-trailing-silence=0.5 +--endpoint.rule3.min-trailing-silence=0.75 +--endpoint.rule4.min-trailing-silence=1.0 diff --git a/Models/vosk-model-small-en-us-0.15/graph/Gr.fst b/Models/vosk-model-small-en-us-0.15/graph/Gr.fst new file mode 100644 index 0000000..1f292e6 Binary files /dev/null and b/Models/vosk-model-small-en-us-0.15/graph/Gr.fst differ diff --git a/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst b/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst new file mode 100644 index 0000000..9797b26 Binary files /dev/null and b/Models/vosk-model-small-en-us-0.15/graph/HCLr.fst differ diff --git a/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int b/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int new file mode 100644 index 0000000..762fd5f --- /dev/null +++ b/Models/vosk-model-small-en-us-0.15/graph/disambig_tid.int @@ -0,0 +1,17 @@ +10015 +10016 +10017 +10018 +10019 +10020 +10021 +10022 +10023 +10024 +10025 +10026 +10027 +10028 +10029 +10030 +10031 diff --git a/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int b/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int new file mode 100644 index 0000000..df23fd7 --- /dev/null +++ b/Models/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int @@ -0,0 +1,166 @@ +1 nonword +2 begin +3 end +4 internal +5 singleton +6 nonword +7 begin +8 end +9 internal +10 singleton +11 begin +12 end +13 internal +14 singleton +15 begin +16 end +17 internal +18 singleton +19 begin +20 end +21 internal +22 singleton +23 begin +24 end +25 internal +26 singleton +27 
begin +28 end +29 internal +30 singleton +31 begin +32 end +33 internal +34 singleton +35 begin +36 end +37 internal +38 singleton +39 begin +40 end +41 internal +42 singleton +43 begin +44 end +45 internal +46 singleton +47 begin +48 end +49 internal +50 singleton +51 begin +52 end +53 internal +54 singleton +55 begin +56 end +57 internal +58 singleton +59 begin +60 end +61 internal +62 singleton +63 begin +64 end +65 internal +66 singleton +67 begin +68 end +69 internal +70 singleton +71 begin +72 end +73 internal +74 singleton +75 begin +76 end +77 internal +78 singleton +79 begin +80 end +81 internal +82 singleton +83 begin +84 end +85 internal +86 singleton +87 begin +88 end +89 internal +90 singleton +91 begin +92 end +93 internal +94 singleton +95 begin +96 end +97 internal +98 singleton +99 begin +100 end +101 internal +102 singleton +103 begin +104 end +105 internal +106 singleton +107 begin +108 end +109 internal +110 singleton +111 begin +112 end +113 internal +114 singleton +115 begin +116 end +117 internal +118 singleton +119 begin +120 end +121 internal +122 singleton +123 begin +124 end +125 internal +126 singleton +127 begin +128 end +129 internal +130 singleton +131 begin +132 end +133 internal +134 singleton +135 begin +136 end +137 internal +138 singleton +139 begin +140 end +141 internal +142 singleton +143 begin +144 end +145 internal +146 singleton +147 begin +148 end +149 internal +150 singleton +151 begin +152 end +153 internal +154 singleton +155 begin +156 end +157 internal +158 singleton +159 begin +160 end +161 internal +162 singleton +163 begin +164 end +165 internal +166 singleton diff --git a/Models/vosk-model-small-en-us-0.15/ivector/final.dubm b/Models/vosk-model-small-en-us-0.15/ivector/final.dubm new file mode 100644 index 0000000..db789eb Binary files /dev/null and b/Models/vosk-model-small-en-us-0.15/ivector/final.dubm differ diff --git a/Models/vosk-model-small-en-us-0.15/ivector/final.ie 
b/Models/vosk-model-small-en-us-0.15/ivector/final.ie new file mode 100644 index 0000000..93737bf Binary files /dev/null and b/Models/vosk-model-small-en-us-0.15/ivector/final.ie differ diff --git a/Models/vosk-model-small-en-us-0.15/ivector/final.mat b/Models/vosk-model-small-en-us-0.15/ivector/final.mat new file mode 100644 index 0000000..c3ec635 Binary files /dev/null and b/Models/vosk-model-small-en-us-0.15/ivector/final.mat differ diff --git a/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats b/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats new file mode 100644 index 0000000..b9d92ef --- /dev/null +++ b/Models/vosk-model-small-en-us-0.15/ivector/global_cmvn.stats @@ -0,0 +1,3 @@ + [ + 1.682383e+11 -1.1595e+10 -1.521733e+10 4.32034e+09 -2.257938e+10 -1.969666e+10 -2.559265e+10 -1.535687e+10 -1.276854e+10 -4.494483e+09 -1.209085e+10 -5.64008e+09 -1.134847e+10 -3.419512e+09 -1.079542e+10 -4.145463e+09 -6.637486e+09 -1.11318e+09 -3.479773e+09 -1.245932e+08 -1.386961e+09 6.560655e+07 -2.436518e+08 -4.032432e+07 4.620046e+08 -7.714964e+07 9.551484e+08 -4.119761e+08 8.208582e+08 -7.117156e+08 7.457703e+08 -4.3106e+08 1.202726e+09 2.904036e+08 1.231931e+09 3.629848e+08 6.366939e+08 -4.586172e+08 -5.267629e+08 -3.507819e+08 1.679838e+09 + 1.741141e+13 8.92488e+11 8.743834e+11 8.848896e+11 1.190313e+12 1.160279e+12 1.300066e+12 1.005678e+12 9.39335e+11 8.089614e+11 7.927041e+11 6.882427e+11 6.444235e+11 5.151451e+11 4.825723e+11 3.210106e+11 2.720254e+11 1.772539e+11 1.248102e+11 6.691599e+10 3.599804e+10 1.207574e+10 1.679301e+09 4.594778e+08 5.821614e+09 1.451758e+10 2.55803e+10 3.43277e+10 4.245286e+10 4.784859e+10 4.988591e+10 4.925451e+10 5.074584e+10 4.9557e+10 4.407876e+10 3.421443e+10 3.138606e+10 2.539716e+10 1.948134e+10 1.381167e+10 0 ] diff --git a/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf b/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf new file mode 100644 index 0000000..7748a4a --- /dev/null +++ 
b/Models/vosk-model-small-en-us-0.15/ivector/online_cmvn.conf @@ -0,0 +1 @@ +# configuration file for apply-cmvn-online, used in the script ../local/run_online_decoding.sh diff --git a/Models/vosk-model-small-en-us-0.15/ivector/splice.conf b/Models/vosk-model-small-en-us-0.15/ivector/splice.conf new file mode 100644 index 0000000..960cd2e --- /dev/null +++ b/Models/vosk-model-small-en-us-0.15/ivector/splice.conf @@ -0,0 +1,2 @@ +--left-context=3 +--right-context=3 diff --git a/Voice/marcus_voice.py b/Voice/marcus_voice.py index c59d49a..afa58e3 100644 --- a/Voice/marcus_voice.py +++ b/Voice/marcus_voice.py @@ -27,6 +27,7 @@ import sys import threading import time from logging.handlers import RotatingFileHandler +from typing import Optional import numpy as np @@ -87,9 +88,10 @@ class VoiceModule: self._stt = self._config["stt"] self._mic = self._config["mic"] - # Whisper models — lazy loaded on first _voice_loop() iteration - self._wake_model = None - self._cmd_model = None + # STT (Vosk) — lazy loaded on first _voice_loop() iteration. + # One Model instance, recognizers are created fresh per-utterance. + self._vosk_model = None + self._KaldiRecognizer = None # Wake words (English only — built-in TTS doesn't do Arabic) self._wake_en = [w.lower() for w in self._stt.get("wake_words_en", @@ -115,29 +117,52 @@ class VoiceModule: # ─── MODEL LOADING ──────────────────────────────────── - def _load_whisper(self): + def _load_stt(self): """ - Lazy-load Whisper models on CPU. + Load Vosk ASR model. Replaces openai-whisper which produced garbage + (!!!!!!!) on this Jetson's torch-aarch64 install regardless of + audio quality. Vosk uses Kaldi's own CPU kernels — no torch, no + numerical instability, ~10× faster than Whisper base on CPU. - Force device='cpu' regardless of torch.cuda.is_available(). 
On the - Jetson the torch install sometimes claims CUDA but can't deserialize - to it (aarch64 wheel mismatch), and Whisper's default device-auto - then crashes with: - _pickle.UnpicklingError: Weights only load failed. - Attempting to deserialize object on CUDA device 0 - CPU-only inference is plenty fast for Whisper tiny (~80 MB model). + Model path is configured via stt.vosk_model_path (relative to + PROJECT_ROOT, or absolute). Default: the small English model, + which is ~40 MB and plenty for short voice commands. """ - import whisper + from vosk import Model, KaldiRecognizer, SetLogLevel + SetLogLevel(-1) # silence Vosk's stderr spam - if self._wake_model is None: - log.info("Loading Whisper '%s' for wake word (CPU)...", self._stt["wake_model"]) - self._wake_model = whisper.load_model(self._stt["wake_model"], device="cpu") - log.info("Wake model ready") + if self._vosk_model is None: + rel = self._stt.get("vosk_model_path", "Models/vosk-model-small-en-us-0.15") + model_path = rel if os.path.isabs(rel) else os.path.join(PROJECT_ROOT, rel) + if not os.path.isdir(model_path): + raise RuntimeError( + "[Voice] Vosk model not found at " + model_path + "\n" + " Download it on the Jetson:\n" + " cd ~/Marcus/Models\n" + " wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip\n" + " unzip vosk-model-small-en-us-0.15.zip" + ) + log.info("Loading Vosk model: %s", model_path) + self._vosk_model = Model(model_path) + self._KaldiRecognizer = KaldiRecognizer + log.info("Vosk model ready") - if self._cmd_model is None: - log.info("Loading Whisper '%s' for commands (CPU)...", self._stt["command_model"]) - self._cmd_model = whisper.load_model(self._stt["command_model"], device="cpu") - log.info("Command model ready") + # NO restricted grammar. 
Vosk's small English model's lexicon + # doesn't contain "sanad" (it's not an English word), so passing + # it in a restricted grammar makes Vosk drop the word with: + # WARNING (VoskAPI:UpdateGrammarFst) Ignoring word missing in + # vocabulary: 'sanad' + # and the decoder then only has "[unk]" → never matches + # anything → Transcribed always empty. + # + # Instead: open vocabulary transcription, fuzzy-match against + # the stt.wake_words_en list which contains the English words + # Vosk ACTUALLY hears when you say "sanad" (then, send, sand, + # step, signed, etc.). + self._wake_grammar = None + + # Back-compat alias for any caller that still references the old name + _load_whisper = _load_stt # ─── MIC RECORDING (G1 built-in UDP) ────────────────── @@ -189,72 +214,55 @@ class VoiceModule: # ─── TRANSCRIPTION ──────────────────────────────────── - def _transcribe(self, audio: np.ndarray, model, task: str = "transcribe") -> str: - """Transcribe audio using Whisper. Returns text.""" - import warnings - import whisper + def _transcribe(self, audio: np.ndarray, grammar: Optional[str] = None) -> str: + """ + Transcribe audio using Vosk. - # Audio stats — log before transcribe so we can see exactly what - # Whisper is being fed. Useful when wake-word never fires: if - # peak_int16 is always < 500 the mic is too quiet regardless of - # any software gain. + When `grammar` is a JSON list string (e.g. `'["send","[unk]"]'`), + Vosk is constrained to that vocabulary only. Words missing from the + model's lexicon (such as "sanad") are ignored, so wake-word detection + passes grammar=None (open vocabulary), as does command transcription. + """ + import json as _json + + # Audio stats — still useful for "mic is silent" diagnostics. 
peak_i16 = int(np.abs(audio).max()) if audio.size else 0 rms_i16 = float(np.sqrt(np.mean(audio.astype(np.float64) ** 2))) if audio.size else 0.0 log.info("audio stats: samples=%d peak=%d rms=%.1f", audio.size, peak_i16, rms_i16) - # Convert int16 to float32 [-1, 1] - audio_f32 = audio.astype(np.float32) / 32768.0 - - # Normalize to ~0.9 peak so Whisper's mel features carry real energy. - # Harmless on already-loud audio. Skip if peak is essentially zero - # (no signal at all) — amplifying pure noise doesn't help. - peak = float(np.abs(audio_f32).max()) - if peak > 1e-4 and peak < 0.9: - audio_f32 = audio_f32 * (0.9 / peak) - log.info("normalized ×%.1f (peak %.4f → 0.9)", 0.9 / peak, peak) - - # Suppress the per-call "Performing inference on CPU when CUDA is - # available" UserWarning. A module-level warnings.filterwarnings() - # doesn't catch it because whisper re-issues the warning every call - # via its own logger path. catch_warnings scoped to this call is - # the clean way. - # - # CRITICAL: temperature=0.0 (greedy, no fallback). - # Whisper's default temperature schedule is (0.0, 0.2, 0.4, 0.6, - # 0.8, 1.0) — it retries with higher temperatures when the greedy - # pass misses a quality gate. The retry path calls - # `Categorical(logits=logits / temperature).sample()` which blows - # up on Jetson's torch-aarch64 (logits overflow to inf → softmax - # becomes NaN). Traceback (2026-04-22): - # ValueError: Expected parameter logits ... found invalid values: - # tensor([[nan, nan, nan, ..., nan, nan, nan]]) - # The voice thread crashed every 2 s and wake-word never fired. - # Forcing temperature=0.0 stays on the greedy path (argmax), which - # has no Categorical sampler and no numerical instability. 
- with warnings.catch_warnings(): - warnings.simplefilter("ignore") - result = model.transcribe( - audio_f32, - language=self._stt["language"], # None = auto-detect - task=task, - fp16=False, - temperature=0.0, # no fallback — avoids NaN bug - condition_on_previous_text=False, # no accumulated context - ) - text = result["text"].strip() - detected_lang = result.get("language", "unknown") - - # Filter Whisper's "no phonetic content" degeneration patterns. - # Near-silence or very quiet speech can produce repetitive filler - # like "!!!!!!!!!", ". . . . .", "... ... ...", or a single - # repeated word. Treat anything with < 3 distinct alphanumeric - # characters as silence so the wake-word check doesn't see it. - alnum = ''.join(c.lower() for c in text if c.isalnum()) - if not alnum or len(set(alnum)) < 3: - log.info("Transcribed [%s]: (filtered as noise: %r)", detected_lang, text[:60]) + if audio.size == 0: return "" - log.info("Transcribed [%s]: %s", detected_lang, text[:100]) + # Fresh recognizer per utterance. Pass grammar if provided. + if grammar: + rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate, grammar) + else: + rec = self._KaldiRecognizer(self._vosk_model, self._sample_rate) + rec.SetWords(False) + + # Single-shot: feed the whole utterance in one AcceptWaveform call, + # then take FinalResult. Chunk-based feeding split short "sanad" + # utterances across chunk boundaries and Vosk's decoder often + # refused to commit, returning empty. Single-shot works for every + # voice-assistant example in Vosk's docs. + # + # When FinalResult is empty, also check PartialResult, intended for cases + # where Vosk heard something but hit no segmentation boundary. NOTE(review): + # FinalResult may reset the recognizer, leaving this partial empty — confirm. 
+ rec.AcceptWaveform(audio.tobytes()) + final = _json.loads(rec.FinalResult()).get("text", "").strip() + if not final: + partial = _json.loads(rec.PartialResult()).get("partial", "").strip() + if partial: + final = partial + log.info(" (partial only, no final commit)") + text = final + + if not text: + log.info("Transcribed: (empty)") + return "" + + log.info("Transcribed: %s", text[:100]) return text def _check_wake_word(self, text: str) -> bool: @@ -298,23 +306,29 @@ class VoiceModule: time.sleep(1) def _do_idle(self): - """Listen for wake word in 2-second chunks.""" + """Listen for wake word in 4-second chunks. Longer windows give + Vosk's decoder enough context to commit short utterances like a + single 'sanad'.""" # Skip if robot is speaking — prevents self-listening if self._audio.is_speaking: time.sleep(0.2) return - audio = self._record_chunk(2.0) + audio = self._record_chunk(4.0) # Double-check speaking didn't start during recording if self._audio.is_speaking: return - # Skip if too quiet (no one talking) - if audio.std() < 100: + # Skip if too quiet (no one talking). Threshold lowered to 60 to + # match the G1 on-board mic's typical noise floor (std ~30-80 when + # idle, ~150+ when someone speaks). With 100 we were skipping + # quiet "sanad" utterances entirely. + if audio.std() < 60: return - text = self._transcribe(audio, self._wake_model) + # Wake-word pass: self._wake_grammar is None, so this is open-vocabulary transcription; _check_wake_word() then tests the text + text = self._transcribe(audio, grammar=self._wake_grammar) if self._check_wake_word(text): log.info("Wake word detected!") @@ -330,10 +344,18 @@ class VoiceModule: def _do_wake_heard(self): """Record the command until silence.""" - # Wait for "Listening..." TTS to finish before recording + # Wait for "Yes" TTS to finish before recording. while self._audio.is_speaking: time.sleep(0.1) + # CRITICAL: flush the mic ring buffer. 
The UDP multicast receiver + # has been accumulating audio continuously (including pre-wake + # silence and the TTS "Yes" that just played back into the mic + # path). Without flush, _record_until_silence() reads the old + # buffered silence instantly, counts 3 silent chunks, and exits + # before the user has started speaking the command. + self._mic_capture.flush() + log.info("Recording command...") audio = self._record_until_silence() @@ -348,7 +370,7 @@ class VoiceModule: def _do_processing(self): """Transcribe the command and send to brain.""" - text = self._transcribe(self._command_audio, self._cmd_model) + text = self._transcribe(self._command_audio) self._command_audio = None if not text or len(text.strip()) < 2: