From 9991e742da01345491adb24e88c9fd06182a14d9 Mon Sep 17 00:00:00 2001
From: kassam
Date: Wed, 22 Apr 2026 12:32:55 +0400
Subject: [PATCH] Fix VLM cold-load OOM: client timeout, local step commands, no warmup thread

---
 API/llava_api.py         | 10 ++++++-
 Brain/command_parser.py  | 35 ++++++++++++++++++++++++
 Brain/marcus_brain.py    | 57 ++++++----------------------------------
 Config/config_Brain.json |  3 +--
 4 files changed, 53 insertions(+), 52 deletions(-)

diff --git a/API/llava_api.py b/API/llava_api.py
index 22cd904..20d4003 100644
--- a/API/llava_api.py
+++ b/API/llava_api.py
@@ -50,7 +50,15 @@ VERIFY_PROMPT = _prompts["verify_prompt"]
 
 # Explicit Ollama client — lets us route to a remote host (e.g., workstation)
 # without relying on the OLLAMA_HOST env var being set in the launch shell.
-_client = ollama.Client(host=OLLAMA_HOST)
+#
+# CRITICAL: timeout=300 (5 min). The Python `ollama` library defaults to
+# httpx's short timeout. On the Jetson a cold-load of qwen2.5vl:3b takes
+# 60-90 s; with the default timeout the client disconnects mid-load,
+# Ollama interprets that as "client cancelled", aborts the in-progress
+# load, and starts over on the next request. This caused the repeated
+# OOM crashes — the model was never finishing a single load before being
+# thrown away and restarted.
+_client = ollama.Client(host=OLLAMA_HOST, timeout=300)
 
 # Safe-mode replies used when subsystems.vlm == false
 _VLM_OFF_TALK = "Scene understanding is disabled — Sanad is in safe mode."
diff --git a/Brain/command_parser.py b/Brain/command_parser.py
index f5eb517..a166e28 100644
--- a/Brain/command_parser.py
+++ b/Brain/command_parser.py
@@ -27,6 +27,15 @@ _RE_WALK_BACK = re.compile(
     r"^(?:walk|go|move)\s+backward?\s+(\d+(?:\.\d+)?)\s*m(?:eter(?:s)?)?$", re.I)
 _RE_TURN_DEG = re.compile(
     r"^turn\s+(?:(left|right)\s+)?(\d+(?:\.\d+)?)\s*deg(?:ree(?:s)?)?$", re.I)
+# Step-based motion: "walk 1 step", "walk forward 2 steps", "move back 1 step",
+# "turn left", "turn right 2 steps". Kept local so these never fall through to
+# the VLM — on the Jetson the cold-load is 60-90 s and we don't want to pay
+# that for a trivial two-second motion. One step = 2 s of motion at the default
+# velocity, matching the undo-loop duration already used below.
+_RE_WALK_STEP = re.compile(
+    r"^(?:walk|go|move|step)(?:\s+(forward|back(?:ward)?))?\s+(\d+)\s*steps?$", re.I)
+_RE_TURN_STEP = re.compile(
+    r"^turn\s+(left|right)(?:\s+(\d+)\s*steps?)?$", re.I)
 _RE_PATROL_RT = re.compile(
     r"^patrol[/:]\s*(.+)$", re.I)
 _RE_LAST_CMD = re.compile(
@@ -144,6 +153,32 @@ def try_local_command(cmd: str) -> bool:
         gradual_stop()
         return True
 
+    m = _RE_WALK_STEP.match(cmd)
+    if m:
+        direction = (m.group(1) or "forward").lower()
+        steps = int(m.group(2))
+        vx = -0.2 if direction.startswith("back") else 0.3
+        duration = 2.0 * steps
+        t0 = time.time()
+        while time.time() - t0 < duration:
+            send_vel(vx=vx)
+            time.sleep(0.05)
+        gradual_stop()
+        return True
+
+    m = _RE_TURN_STEP.match(cmd)
+    if m:
+        direction = m.group(1).lower()
+        steps = int(m.group(2)) if m.group(2) else 1
+        vyaw = 0.3 if direction == "left" else -0.3
+        duration = 2.0 * steps
+        t0 = time.time()
+        while time.time() - t0 < duration:
+            send_vel(vyaw=vyaw)
+            time.sleep(0.05)
+        gradual_stop()
+        return True
+
     # ── NAMED PATROL ROUTE ───────────────────────────────────────────────
     m = _RE_PATROL_RT.match(cmd)
     if m:
diff --git a/Brain/marcus_brain.py b/Brain/marcus_brain.py
index bb02a24..c88ed4f 100644
--- a/Brain/marcus_brain.py
+++ b/Brain/marcus_brain.py
@@ -147,19 +147,19 @@ def init_brain():
 
     _log("Brain initialized", "info", "brain")
 
-    # Skip warmup when VLM is off — there's no model to warm, and the
-    # dashboard should mention that Marcus is in safe mode.
+    # Report VLM config only — no warmup thread. This matches Marcus_v1's
+    # approach: the first real VLM command performs the cold-load synchronously
+    # inside ollama.chat(), which takes ~60-90 s on the Jetson the first time
+    # and is fast for every subsequent call. A background warmup thread races
+    # with YOLO/camera/audio/Holosoma startup and with user input, and on a
+    # 16 GB unified-memory board that race is what triggers the OOM killer.
     from API.llava_api import VLM_ENABLED, OLLAMA_HOST
     if not VLM_ENABLED:
         print(" [VLM] disabled by config — safe mode (no Ollama load)")
     else:
         host_short = OLLAMA_HOST.replace("http://", "")
-        print(f" [VLM] target: {host_short} ({OLLAMA_MODEL})")
-        # Warmup runs in a daemon thread so the dashboard + Command: prompt
-        # appear immediately. The first real user command will either hit a
-        # warm model (fast) or pay the cold-load itself (same as before).
-        import threading as _t
-        _t.Thread(target=_warmup_llava, daemon=True, name="llava-warmup").start()
+        print(f" [VLM] target: {host_short} ({OLLAMA_MODEL}) "
+              f"— first vision command will cold-load (~60-90 s)")
 
 
 # Global voice references
@@ -419,47 +419,6 @@ def _strip_ansi(s: str) -> str:
     return _re.sub(r"\x1b\[[0-9;]*m", "", s)
 
 
-def _warmup_llava():
-    """
-    Runs in a daemon thread — primes the Ollama model into VRAM so the first
-    user command doesn't pay the ~15-20 s cold-load. Tolerates client-timeout
-    on the first attempt (common on the 16 GB Jetson when the compute graph
-    needs a second pass to fit) by doing one silent retry.
-    """
-    import ollama
-    # Quiet heartbeat in the terminal so the operator knows something is happening
-    # without flooding stdout once the banner prints.
-    print(" [Warmup] Loading Qwen2.5-VL into GPU...")
-    base_options = {
-        "temperature": 0.0,
-        "num_predict": _cfg["warmup_num_predict"],
-        # Honor the same compute-graph caps everything else uses, otherwise
-        # Ollama reverts to batch=512/ctx=4096 for this call and OOMs.
-        "num_batch": _cfg.get("num_batch", 128),
-        "num_ctx": _cfg.get("num_ctx", 2048),
-    }
-    for attempt in (1, 2):
-        try:
-            img = get_frame()
-            ollama.chat(
-                model=OLLAMA_MODEL,
-                messages=[{"role": "user", "content": "hi",
-                           "images": [img] if img else []}],
-                options=base_options,
-            )
-            print(" [Warmup] Ready — first command will be fast")
-            return
-        except Exception as e:
-            if attempt == 1:
-                # Cold-load frequently times out on attempt #1 while Ollama
-                # is still allocating the compute graph. The model stays
-                # loaded though, so attempt #2 almost always succeeds.
-                print(f" [Warmup] first attempt timed out, retrying...")
-                continue
-            print(f" [Warmup] failed after retry ({e}) — first real command "
-                  f"will pay the cold-load (~15-20 s)")
-
-
 def get_brain_status() -> dict:
     """Return current brain status for server status message."""
     from API.yolo_api import YOLO_AVAILABLE as _ya
diff --git a/Config/config_Brain.json b/Config/config_Brain.json
index cf85c53..65d14ad 100644
--- a/Config/config_Brain.json
+++ b/Config/config_Brain.json
@@ -15,6 +15,5 @@
   "num_predict_goal": 80,
   "num_predict_patrol": 100,
   "num_predict_talk": 80,
-  "num_predict_verify": 10,
-  "warmup_num_predict": 5
+  "num_predict_verify": 10
 }
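
Note (editor's addendum, not part of the patch): a minimal cold-load probe to sanity-check the timeout fix in API/llava_api.py. This is a sketch under stated assumptions, not repo code: the host URL and filename are placeholders to adjust, the model tag comes from the comment in the llava_api.py hunk, and the option values mirror the removed warmup's defaults (warmup_num_predict=5, num_batch=128, num_ctx=2048) so the probe walks the same allocation path as a real command.

# cold_load_probe.py (hypothetical helper, not in the repo).
# Times the first chat round-trip against a 300 s client, mirroring the
# _client construction this patch adds to API/llava_api.py.
import time

import ollama

OLLAMA_HOST = "http://127.0.0.1:11434"  # assumption: point at your Jetson or workstation
MODEL = "qwen2.5vl:3b"                  # tag named in the llava_api.py comment

client = ollama.Client(host=OLLAMA_HOST, timeout=300)  # the same 300 s fix

t0 = time.time()
client.chat(
    model=MODEL,
    messages=[{"role": "user", "content": "hi"}],
    # Same compute-graph caps the removed warmup used, so the probe
    # exercises the same load path as a real command.
    options={"num_predict": 5, "num_batch": 128, "num_ctx": 2048},
)
print(f"cold-load + first reply: {time.time() - t0:.1f} s")

Run it twice: with the fix in place, the first run should finish inside the 60-90 s window quoted in the patch comments rather than timing out, and the second run should return within a few seconds because the model is still resident.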