From 9991e742da01345491adb24e88c9fd06182a14d9 Mon Sep 17 00:00:00 2001
From: kassam
Date: Wed, 22 Apr 2026 12:32:55 +0400
Subject: [PATCH] Fix VLM cold-load OOM: client timeout, local step commands, no warmup thread

---
 API/llava_api.py         | 10 ++++++-
 Brain/command_parser.py  | 35 ++++++++++++++++++++++++
 Brain/marcus_brain.py    | 57 ++++++----------------------------------
 Config/config_Brain.json |  3 +--
 4 files changed, 53 insertions(+), 52 deletions(-)

diff --git a/API/llava_api.py b/API/llava_api.py
index 22cd904..20d4003 100644
--- a/API/llava_api.py
+++ b/API/llava_api.py
@@ -50,7 +50,15 @@ VERIFY_PROMPT = _prompts["verify_prompt"]
 
 # Explicit Ollama client — lets us route to a remote host (e.g., workstation)
 # without relying on the OLLAMA_HOST env var being set in the launch shell.
-_client = ollama.Client(host=OLLAMA_HOST)
+#
+# CRITICAL: timeout=300 (5 min). The Python `ollama` library defaults to
+# httpx's short timeout. On the Jetson a cold-load of qwen2.5vl:3b takes
+# 60-90 s; with the default timeout the client disconnects mid-load,
+# Ollama interprets that as "client cancelled", aborts the in-progress
+# load, and starts over on the next request. This caused the repeated
+# OOM crashes — the model was never finishing a single load before being
+# thrown away and restarted.
+_client = ollama.Client(host=OLLAMA_HOST, timeout=300)
 
 # Safe-mode replies used when subsystems.vlm == false
 _VLM_OFF_TALK = "Scene understanding is disabled — Sanad is in safe mode."
diff --git a/Brain/command_parser.py b/Brain/command_parser.py
index f5eb517..a166e28 100644
--- a/Brain/command_parser.py
+++ b/Brain/command_parser.py
@@ -27,6 +27,15 @@ _RE_WALK_BACK = re.compile(
     r"^(?:walk|go|move)\s+backward?\s+(\d+(?:\.\d+)?)\s*m(?:eter(?:s)?)?$", re.I)
 _RE_TURN_DEG = re.compile(
     r"^turn\s+(?:(left|right)\s+)?(\d+(?:\.\d+)?)\s*deg(?:ree(?:s)?)?$", re.I)
+# Step-based motion: "walk 1 step", "walk forward 2 steps", "move back 1 step",
+# "turn left", "turn right 2 steps". Kept local so these never fall through to
+# the VLM — on the Jetson the cold-load is 60-90 s and we don't want to pay
+# that for a trivial two-second motion. One step = 2 s of motion at the default
+# velocity, matching the undo-loop duration already used below.
+_RE_WALK_STEP = re.compile(
+    r"^(?:walk|go|move|step)(?:\s+(forward|back(?:ward)?))?\s+(\d+)\s*steps?$", re.I)
+_RE_TURN_STEP = re.compile(
+    r"^turn\s+(left|right)(?:\s+(\d+)\s*steps?)?$", re.I)
 _RE_PATROL_RT = re.compile(
     r"^patrol[/:]\s*(.+)$", re.I)
 _RE_LAST_CMD = re.compile(
@@ -144,6 +153,32 @@ def try_local_command(cmd: str) -> bool:
         gradual_stop()
         return True
 
+    m = _RE_WALK_STEP.match(cmd)
+    if m:
+        direction = (m.group(1) or "forward").lower()
+        steps = int(m.group(2))
+        vx = -0.2 if direction.startswith("back") else 0.3
+        duration = 2.0 * steps
+        t0 = time.time()
+        while time.time() - t0 < duration:
+            send_vel(vx=vx)
+            time.sleep(0.05)
+        gradual_stop()
+        return True
+
+    m = _RE_TURN_STEP.match(cmd)
+    if m:
+        direction = m.group(1).lower()
+        steps = int(m.group(2)) if m.group(2) else 1
+        vyaw = 0.3 if direction == "left" else -0.3
+        duration = 2.0 * steps
+        t0 = time.time()
+        while time.time() - t0 < duration:
+            send_vel(vyaw=vyaw)
+            time.sleep(0.05)
+        gradual_stop()
+        return True
+
     # ── NAMED PATROL ROUTE ───────────────────────────────────────────────
     m = _RE_PATROL_RT.match(cmd)
     if m:
diff --git a/Brain/marcus_brain.py b/Brain/marcus_brain.py
index bb02a24..c88ed4f 100644
--- a/Brain/marcus_brain.py
+++ b/Brain/marcus_brain.py
@@ -147,19 +147,19 @@ def init_brain():
 
     _log("Brain initialized", "info", "brain")
 
-    # Skip warmup when VLM is off — there's no model to warm, and the
-    # dashboard should mention that Marcus is in safe mode.
+    # Report VLM config only — no warmup thread. This matches Marcus_v1's
+    # approach: the first real VLM command performs the cold-load synchronously
+    # inside ollama.chat(), which takes ~60-90 s on the Jetson the first time
+    # and is fast for every subsequent call. A background warmup thread races
+    # with YOLO/camera/audio/Holosoma startup and with user input, and on a
+    # 16 GB unified-memory board that race is what triggers the OOM killer.
     from API.llava_api import VLM_ENABLED, OLLAMA_HOST
     if not VLM_ENABLED:
         print(" [VLM] disabled by config — safe mode (no Ollama load)")
     else:
         host_short = OLLAMA_HOST.replace("http://", "")
-        print(f" [VLM] target: {host_short} ({OLLAMA_MODEL})")
-        # Warmup runs in a daemon thread so the dashboard + Command: prompt
-        # appear immediately. The first real user command will either hit a
-        # warm model (fast) or pay the cold-load itself (same as before).
-        import threading as _t
-        _t.Thread(target=_warmup_llava, daemon=True, name="llava-warmup").start()
+        print(f" [VLM] target: {host_short} ({OLLAMA_MODEL}) "
+              f"— first vision command will cold-load (~60-90 s)")
 
 
 # Global voice references
@@ -419,47 +419,6 @@ def _strip_ansi(s: str) -> str:
     return _re.sub(r"\x1b\[[0-9;]*m", "", s)
 
 
-def _warmup_llava():
-    """
-    Runs in a daemon thread — primes the Ollama model into VRAM so the first
-    user command doesn't pay the ~15-20 s cold-load. Tolerates client-timeout
-    on the first attempt (common on the 16 GB Jetson when the compute graph
-    needs a second pass to fit) by doing one silent retry.
-    """
-    import ollama
-    # Quiet heartbeat in the terminal so the operator knows something is happening
-    # without flooding stdout once the banner prints.
-    print(" [Warmup] Loading Qwen2.5-VL into GPU...")
-    base_options = {
-        "temperature": 0.0,
-        "num_predict": _cfg["warmup_num_predict"],
-        # Honor the same compute-graph caps everything else uses, otherwise
-        # Ollama reverts to batch=512/ctx=4096 for this call and OOMs.
-        "num_batch": _cfg.get("num_batch", 128),
-        "num_ctx": _cfg.get("num_ctx", 2048),
-    }
-    for attempt in (1, 2):
-        try:
-            img = get_frame()
-            ollama.chat(
-                model=OLLAMA_MODEL,
-                messages=[{"role": "user", "content": "hi",
-                           "images": [img] if img else []}],
-                options=base_options,
-            )
-            print(" [Warmup] Ready — first command will be fast")
-            return
-        except Exception as e:
-            if attempt == 1:
-                # Cold-load frequently times out on attempt #1 while Ollama
-                # is still allocating the compute graph. The model stays
-                # loaded though, so attempt #2 almost always succeeds.
-                print(f" [Warmup] first attempt timed out, retrying...")
-                continue
-            print(f" [Warmup] failed after retry ({e}) — first real command "
-                  f"will pay the cold-load (~15-20 s)")
-
-
 def get_brain_status() -> dict:
     """Return current brain status for server status message."""
     from API.yolo_api import YOLO_AVAILABLE as _ya
diff --git a/Config/config_Brain.json b/Config/config_Brain.json
index cf85c53..65d14ad 100644
--- a/Config/config_Brain.json
+++ b/Config/config_Brain.json
@@ -15,6 +15,5 @@
   "num_predict_goal": 80,
   "num_predict_patrol": 100,
   "num_predict_talk": 80,
-  "num_predict_verify": 10,
-  "warmup_num_predict": 5
+  "num_predict_verify": 10
 }
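
Note (editor's addendum, not part of the patch): a minimal cold-load probe to sanity-check the timeout fix in API/llava_api.py. This is a sketch under stated assumptions, not repo code: the host URL and filename are placeholders to adjust, the model tag comes from the comment in the llava_api.py hunk, and the option values mirror the removed warmup's defaults (warmup_num_predict=5, num_batch=128, num_ctx=2048) so the probe walks the same allocation path as a real command.

# cold_load_probe.py (hypothetical helper, not in the repo).
# Times the first chat round-trip against a 300 s client, mirroring the
# _client construction this patch adds to API/llava_api.py.
import time

import ollama

OLLAMA_HOST = "http://127.0.0.1:11434"  # assumption: point at your Jetson or workstation
MODEL = "qwen2.5vl:3b"                  # tag named in the llava_api.py comment

client = ollama.Client(host=OLLAMA_HOST, timeout=300)  # the same 300 s fix

t0 = time.time()
client.chat(
    model=MODEL,
    messages=[{"role": "user", "content": "hi"}],
    # Same compute-graph caps the removed warmup used, so the probe
    # exercises the same load path as a real command.
    options={"num_predict": 5, "num_batch": 128, "num_ctx": 2048},
)
print(f"cold-load + first reply: {time.time() - t0:.1f} s")

Run it twice: with the fix in place, the first run should finish inside the 60-90 s window quoted in the patch comments rather than timing out, and the second run should return within a few seconds because the model is still resident.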