From f4ff5c27faaafdf83c16f6517a069be8cde73924 Mon Sep 17 00:00:00 2001
From: kassam
Date: Wed, 22 Apr 2026 13:47:15 +0400
Subject: [PATCH] Update 2026-04-22 13:47:14

---
 Config/config_Brain.json |  2 +-
 install_ollama_jetson.sh | 17 +++++++++----
 run_marcus.py            | 11 ++++++++
 warmup_vlm.sh            | 54 ++++++++++++++++++++++++++--------------
 4 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/Config/config_Brain.json b/Config/config_Brain.json
index 53f900f..57d90f2 100644
--- a/Config/config_Brain.json
+++ b/Config/config_Brain.json
@@ -2,7 +2,7 @@
   "ollama_model": "qwen2.5vl:3b",
   "ollama_host": "http://127.0.0.1:11434",
   "max_history": 6,
-  "num_batch": 128,
+  "num_batch": 64,
   "num_ctx": 1024,
   "subsystems": {
     "vlm": true,
diff --git a/install_ollama_jetson.sh b/install_ollama_jetson.sh
index d5acd2b..d76b3cb 100755
--- a/install_ollama_jetson.sh
+++ b/install_ollama_jetson.sh
@@ -44,11 +44,18 @@ Environment="OLLAMA_MAX_LOADED_MODELS=1"
 # 153 ms and the robot fell. 24h means one cold-load per day (during
 # warmup_vlm.sh while the robot is in squat), everything after is warm.
 Environment="OLLAMA_KEEP_ALIVE=24h"
-# Reserve 3 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
-# Holosoma + camera + Python heap). Raised from 2 GiB after observing
-# Holosoma starvation during image requests — 2 GiB was enough for memory
-# but not for memory-bandwidth headroom.
-Environment="OLLAMA_GPU_OVERHEAD=3221225472"
+# Reserve 4 GiB of the 15 GiB iGPU for the rest of the system (YOLO +
+# Holosoma + camera + Python heap + vision-encoder transient spike).
+# Raised from 3 GiB after observing the robot wobble/fall during an
+# `ask()` call — the vision encoder's peak activations were still eating
+# into Holosoma's slack memory.
+Environment="OLLAMA_GPU_OVERHEAD=4294967296"
+# Cap the model's context window at the server level so Ollama's compute
+# graph pre-allocation shrinks from 7.3 GiB (KvSize:2048 default) to
+# ~3-4 GiB (KvSize:1024). Runtime num_ctx in request options is ignored
+# for compute-graph sizing — only this env var affects the load-time
+# allocation. 1024 tokens is enough for our prompts + short answers.
+Environment="OLLAMA_CONTEXT_LENGTH=1024"
 # Deprioritize Ollama so it never preempts Holosoma's real-time locomotion
 # thread. IOSchedulingClass=idle = Ollama only gets disk bandwidth when
 # nobody else wants it. Nice=10 = lowest normal priority.
diff --git a/run_marcus.py b/run_marcus.py
index 09f3105..c1e9df4 100644
--- a/run_marcus.py
+++ b/run_marcus.py
@@ -19,6 +19,17 @@ PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
 if PROJECT_ROOT not in sys.path:
     sys.path.insert(0, PROJECT_ROOT)
 
+# Tell the Linux OOM killer to pick Ollama (easily restarted) or any other
+# process before Marcus. -900 is almost-but-not-quite OOM-immune; we avoid
+# -1000 because that marks the process fully unkillable, which is risky if
+# Marcus itself ever has a runaway allocation. Lowering oom_score_adj needs
+# root (CAP_SYS_RESOURCE); if the write is refused we fall into the except.
+try:
+    with open(f"/proc/{os.getpid()}/oom_score_adj", "w") as _f:
+        _f.write("-900")
+except OSError:
+    pass  # not fatal: unprivileged (no CAP_SYS_RESOURCE) or sandboxed
+
 from Brain.marcus_brain import run_terminal
 
 if __name__ == "__main__":
diff --git a/warmup_vlm.sh b/warmup_vlm.sh
index 556a37f..5a67597 100755
--- a/warmup_vlm.sh
+++ b/warmup_vlm.sh
@@ -24,20 +24,6 @@ set -e
 MODEL="${1:-qwen2.5vl:3b}"
 HOST="http://127.0.0.1:11434"
 
-cat <<'BANNER'
-════════════════════════════════════════════════════════════════════
-  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
-════════════════════════════════════════════════════════════════════
-
-  The next 60-90 s will hammer disk + memory bandwidth. Holosoma
-  cannot balance through this. If the robot is standing, IT WILL FALL.
-
-  Press ENTER when the robot is safely in squat / damping.
-  Press Ctrl-C to abort.
-
-BANNER
-read -r
-
 # Sanity: is ollama reachable?
 if ! curl -sf "$HOST/api/version" > /dev/null; then
   echo "✗ Ollama is not running on $HOST"
@@ -52,9 +38,35 @@ if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
   exit 1
 fi
 
-echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
-echo "  Watching tegrastats in a second window is useful — GPU should spike."
-echo
+# Is the model ALREADY resident? `ollama ps` lists running models with their
+# keep-alive. If qwen2.5vl:3b is there, a ping is ~1 s and the robot doesn't
+# need to squat for it. Only ask the user to squat if we're about to do the
+# real 60-90 s disk + iGPU hammer.
+ALREADY_WARM=0
+if ollama ps 2>/dev/null | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
+  ALREADY_WARM=1
+fi
+
+if [[ $ALREADY_WARM -eq 1 ]]; then
+  echo "→ $MODEL is already resident (keep-alive). Pinging to confirm..."
+else
+  cat <<'BANNER'
+════════════════════════════════════════════════════════════════════
+  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
+════════════════════════════════════════════════════════════════════
+
+  The next 60-90 s will hammer disk + memory bandwidth. Holosoma
+  cannot balance through this. If the robot is standing, IT WILL FALL.
+
+  Press ENTER when the robot is safely in squat / damping.
+  Press Ctrl-C to abort.
+
+BANNER
+  read -r
+  echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
+  echo "  Watching tegrastats in a second window is useful — GPU should spike."
+  echo
+fi
 
 START=$(date +%s)
 
@@ -72,8 +84,12 @@ END=$(date +%s)
 ELAPSED=$((END - START))
 
 if echo "$RESPONSE" | grep -q '"response"'; then
-  echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
-  echo "  You can now stand the robot and run: python3 run_marcus.py"
+  if [[ $ALREADY_WARM -eq 1 ]]; then
+    echo "✓ Model already warm — no cold-load needed (${ELAPSED}s ping)"
+  else
+    echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
+    echo "  You can now stand the robot and run: python3 run_marcus.py"
+  fi
 else
   echo "✗ Warmup failed after ${ELAPSED}s"
   echo "  Response: $RESPONSE"
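
Post-patch notes:

The Environment= overrides in install_ollama_jetson.sh only take effect
once systemd reloads unit files and restarts the service. A quick way to
apply and verify them, assuming the stock `ollama` unit name used by the
upstream installer:

    # reload unit definitions and restart so the new env vars take effect
    sudo systemctl daemon-reload
    sudo systemctl restart ollama

    # confirm the running service actually sees the overrides
    systemctl show ollama -p Environment | tr ' ' '\n' | grep OLLAMA_

    # if the runner logs KvSize as the OLLAMA_CONTEXT_LENGTH comment says,
    # the smaller compute-graph allocation shows up on the next model load
    journalctl -u ollama -b --no-pager | grep -i kvsize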
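On the run_marcus.py change: writing a negative oom_score_adj requires
CAP_SYS_RESOURCE (an unprivileged process may only raise its own score),
so the -900 only sticks when Marcus runs as root or with that capability;
otherwise the except branch swallows the PermissionError and the score
stays at 0. Two spot checks, assuming a single run_marcus.py process:

    # did the -900 stick? prints -900 on success, 0 if the write was refused
    cat "/proc/$(pgrep -f run_marcus.py)/oom_score_adj"

    # is the model still resident, and when does its keep-alive expire?
    ollama ps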
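The request that fills $RESPONSE sits between the warmup_vlm.sh hunks, so
the exact prompt and options are not visible in this diff. A minimal
sketch of what a blocking /api/generate ping could look like; the prompt
text and the mirrored num_ctx/num_batch values from config_Brain.json are
assumptions, not lines taken from the script:

    # "stream": false returns one JSON object with a "response" field,
    # which is exactly what the success check greps for
    RESPONSE=$(curl -sf "$HOST/api/generate" -d '{
      "model": "'"$MODEL"'",
      "prompt": "Reply with one word: ready",
      "stream": false,
      "keep_alive": "24h",
      "options": { "num_ctx": 1024, "num_batch": 64 }
    }')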