From f4ff5c27faaafdf83c16f6517a069be8cde73924 Mon Sep 17 00:00:00 2001
From: kassam
Date: Wed, 22 Apr 2026 13:47:15 +0400
Subject: [PATCH] Update 2026-04-22 13:47:14

---
 Config/config_Brain.json |  2 +-
 install_ollama_jetson.sh | 17 +++++++++----
 run_marcus.py            | 11 ++++++++
 warmup_vlm.sh            | 54 ++++++++++++++++++++++++++--------------
 4 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/Config/config_Brain.json b/Config/config_Brain.json
index 53f900f..57d90f2 100644
--- a/Config/config_Brain.json
+++ b/Config/config_Brain.json
@@ -2,7 +2,7 @@
   "ollama_model": "qwen2.5vl:3b",
   "ollama_host": "http://127.0.0.1:11434",
   "max_history": 6,
-  "num_batch": 128,
+  "num_batch": 64,
   "num_ctx": 1024,
   "subsystems": {
     "vlm": true,
diff --git a/install_ollama_jetson.sh b/install_ollama_jetson.sh
index d5acd2b..d76b3cb 100755
--- a/install_ollama_jetson.sh
+++ b/install_ollama_jetson.sh
@@ -44,11 +44,18 @@ Environment="OLLAMA_MAX_LOADED_MODELS=1"
 # 153 ms and the robot fell. 24h means one cold-load per day (during
 # warmup_vlm.sh while the robot is in squat), everything after is warm.
 Environment="OLLAMA_KEEP_ALIVE=24h"
-# Reserve 3 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
-# Holosoma + camera + Python heap). Raised from 2 GiB after observing
-# Holosoma starvation during image requests — 2 GiB was enough for memory
-# but not for memory-bandwidth headroom.
-Environment="OLLAMA_GPU_OVERHEAD=3221225472"
+# Reserve 4 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
+# Holosoma + camera + Python heap + vision-encoder transient spike).
+# Raised from 3 GiB after observing the robot wobble/fall during an
+# `ask()` call — the vision encoder's peak activations were still eating
+# into Holosoma's slack memory.
+Environment="OLLAMA_GPU_OVERHEAD=4294967296"
+# Cap the model's context window at the server level so Ollama's compute
+# graph pre-allocation shrinks from 7.3 GiB (KvSize:2048 default) to
+# ~3-4 GiB (KvSize:1024). Runtime num_ctx in request options is ignored
+# for compute-graph sizing — only this env var affects the load-time
+# allocation. 1024 tokens is enough for our prompts + short answers.
+Environment="OLLAMA_CONTEXT_LENGTH=1024"
 # Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
 # thread. IOSchedulingClass=idle = Ollama only gets disk bandwidth when
 # nobody else wants it. Nice=10 = lowest normal priority.
diff --git a/run_marcus.py b/run_marcus.py
index 09f3105..c1e9df4 100644
--- a/run_marcus.py
+++ b/run_marcus.py
@@ -19,6 +19,17 @@ PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
 if PROJECT_ROOT not in sys.path:
     sys.path.insert(0, PROJECT_ROOT)
 
+# Tell the Linux OOM killer to pick Ollama (easily restarted) or any other
+# process before Marcus. -900 is almost-but-not-quite OOM-immune; we avoid
+# -1000 because that marks the process fully unkillable, which is risky if
+# Marcus itself ever has a runaway allocation. Lowering oom_score_adj needs
+# root (CAP_SYS_RESOURCE); if the write is refused we fall into the except.
+try:
+    with open(f"/proc/{os.getpid()}/oom_score_adj", "w") as _f:
+        _f.write("-900")
+except OSError:
+    pass  # not fatal: unprivileged (no CAP_SYS_RESOURCE) or sandboxed
+
 from Brain.marcus_brain import run_terminal
 
 if __name__ == "__main__":
diff --git a/warmup_vlm.sh b/warmup_vlm.sh
index 556a37f..5a67597 100755
--- a/warmup_vlm.sh
+++ b/warmup_vlm.sh
@@ -24,20 +24,6 @@ set -e
 MODEL="${1:-qwen2.5vl:3b}"
 HOST="http://127.0.0.1:11434"
 
-cat <<'BANNER'
-════════════════════════════════════════════════════════════════════
-  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
-════════════════════════════════════════════════════════════════════
-
-  The next 60-90 s will hammer disk + memory bandwidth. Holosoma
-  cannot balance through this. If the robot is standing, IT WILL FALL.
-
-  Press ENTER when the robot is safely in squat / damping.
-  Press Ctrl-C to abort.
-
-BANNER
-read -r
-
 # Sanity: is ollama reachable?
 if ! curl -sf "$HOST/api/version" > /dev/null; then
   echo "✗ Ollama is not running on $HOST"
@@ -52,9 +38,35 @@ if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
   exit 1
 fi
 
-echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
-echo "  Watching tegrastats in a second window is useful — GPU should spike."
-echo
+# Is the model ALREADY resident? `ollama ps` lists running models with their
+# keep-alive. If qwen2.5vl:3b is there, a ping is ~1 s and the robot doesn't
+# need to squat for it. Only ask the user to squat if we're about to do the
+# real 60-90 s disk + iGPU hammer.
+ALREADY_WARM=0
+if ollama ps 2>/dev/null | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
+  ALREADY_WARM=1
+fi
+
+if [[ $ALREADY_WARM -eq 1 ]]; then
+  echo "→ $MODEL is already resident (keep-alive). Pinging to confirm..."
+else
+  cat <<'BANNER'
+════════════════════════════════════════════════════════════════════
+  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
+════════════════════════════════════════════════════════════════════
+
+  The next 60-90 s will hammer disk + memory bandwidth. Holosoma
+  cannot balance through this. If the robot is standing, IT WILL FALL.
+
+  Press ENTER when the robot is safely in squat / damping.
+  Press Ctrl-C to abort.
+
+BANNER
+  read -r
+  echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
+  echo "  Watching tegrastats in a second window is useful — GPU should spike."
+  echo
+fi
 
 START=$(date +%s)
 
@@ -72,8 +84,12 @@ END=$(date +%s)
 ELAPSED=$((END - START))
 
 if echo "$RESPONSE" | grep -q '"response"'; then
-  echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
-  echo "  You can now stand the robot and run: python3 run_marcus.py"
+  if [[ $ALREADY_WARM -eq 1 ]]; then
+    echo "✓ Model already warm — no cold-load needed (${ELAPSED}s ping)"
+  else
+    echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
+    echo "  You can now stand the robot and run: python3 run_marcus.py"
+  fi
 else
   echo "✗ Warmup failed after ${ELAPSED}s"
   echo "  Response: $RESPONSE"
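
Post-patch notes:

The Environment= overrides in install_ollama_jetson.sh only take effect
once systemd reloads unit files and restarts the service. A quick way to
apply and verify them, assuming the stock `ollama` unit name used by the
upstream installer:

    # reload unit definitions and restart so the new env vars take effect
    sudo systemctl daemon-reload
    sudo systemctl restart ollama

    # confirm the running service actually sees the overrides
    systemctl show ollama -p Environment | tr ' ' '\n' | grep OLLAMA_

    # if the runner logs KvSize as the OLLAMA_CONTEXT_LENGTH comment says,
    # the smaller compute-graph allocation shows up on the next model load
    journalctl -u ollama -b --no-pager | grep -i kvsize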
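On the run_marcus.py change: writing a negative oom_score_adj requires
CAP_SYS_RESOURCE (an unprivileged process may only raise its own score),
so the -900 only sticks when Marcus runs as root or with that capability;
otherwise the except branch swallows the PermissionError and the score
stays at 0. Two spot checks, assuming a single run_marcus.py process:

    # did the -900 stick? prints -900 on success, 0 if the write was refused
    cat "/proc/$(pgrep -f run_marcus.py)/oom_score_adj"

    # is the model still resident, and when does its keep-alive expire?
    ollama ps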
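The request that fills $RESPONSE sits between the warmup_vlm.sh hunks, so
the exact prompt and options are not visible in this diff. A minimal
sketch of what a blocking /api/generate ping could look like; the prompt
text and the mirrored num_ctx/num_batch values from config_Brain.json are
assumptions, not lines taken from the script:

    # "stream": false returns one JSON object with a "response" field,
    # which is exactly what the success check greps for
    RESPONSE=$(curl -sf "$HOST/api/generate" -d '{
      "model": "'"$MODEL"'",
      "prompt": "Reply with one word: ready",
      "stream": false,
      "keep_alive": "24h",
      "options": { "num_ctx": 1024, "num_batch": 64 }
    }')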