#!/usr/bin/env bash
# warmup_vlm.sh — pre-load Qwen2.5-VL into iGPU BEFORE the robot stands up.
#
# WHY THIS EXISTS:
# Cold-loading a 3 GB vision model on the Jetson's 16 GB unified memory
# hammers disk + memory bandwidth hard enough to blow Holosoma's 20 ms
# real-time deadline. If that happens while the robot is standing, Holosoma
# loses balance control and the robot falls (observed 2026-04-22 —
# RL FPS crashed from 56 → 10 during cold-load, OOM killer reaped Holosoma).
#
# SAFE OPERATING PROCEDURE (do this once per boot):
# 1. Boot robot. Keep it in squat / damping mode (NOT standing).
# 2. Run: ./warmup_vlm.sh
# 3. Wait for "Warmup complete" (~60-90 s).
# 4. NOW raise the robot to standing.
# 5. Run: python3 run_marcus.py
#
# After warmup, OLLAMA_KEEP_ALIVE=24h keeps the model hot for the rest of the
# day, so every "what do you see" is fast (~2-5 s) and Holosoma stays
# healthy. Never cold-load a vision model while the robot is standing.
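#
# USAGE (the tag is optional; it defaults to qwen2.5vl:3b and must already
# be pulled into the Ollama store; the sanity checks below enforce this):
#   ./warmup_vlm.sh
#   ./warmup_vlm.sh <model-tag>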
set -e

MODEL="${1:-qwen2.5vl:3b}"
HOST="http://127.0.0.1:11434"

# Pull num_batch / num_ctx from config_Brain.json. CRITICAL: these must match
# what Marcus will send at runtime, otherwise Ollama evicts the cached runner
# on the FIRST real vision command and re-instantiates it — the overlap of
# old-runner-unloading + new-runner-loading triggers an iGPU OOM and the
# runner dies with status 500. Symptom: warmup succeeds, first `what do you
# see` crashes 10-30 s later with "llama runner process has terminated".
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
NUM_BATCH=$(python3 -c "import json; print(json.load(open('$SCRIPT_DIR/Config/config_Brain.json'))['num_batch'])" 2>/dev/null || echo 64)
NUM_CTX=$(python3 -c "import json; print(json.load(open('$SCRIPT_DIR/Config/config_Brain.json'))['num_ctx'])" 2>/dev/null || echo 1024)
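# Defensive guard (an added sketch, not part of the original flow): if the
# config holds a non-integer for either key, fall back to the same defaults
# rather than sending a malformed options payload to Ollama.
[[ "$NUM_BATCH" =~ ^[0-9]+$ ]] || NUM_BATCH=64
[[ "$NUM_CTX" =~ ^[0-9]+$ ]] || NUM_CTX=1024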
# Sanity: is ollama reachable?
if ! curl -sf "$HOST/api/version" > /dev/null; then
    echo "✗ Ollama is not running on $HOST"
    echo "  Start it: sudo systemctl start ollama"
    exit 1
fi

# Sanity: is the model in the store?
if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
    echo "✗ Model '$MODEL' not found in Ollama store"
    echo "  Pull it: ollama pull $MODEL"
    exit 1
fi

# Is the model ALREADY resident? `ollama ps` lists running models with their
# keep-alive. If $MODEL is there, a ping is ~1 s and the robot doesn't need
# to squat for it. Only ask the user to squat if we're about to do the real
# 60-90 s disk + iGPU hammer.
ALREADY_WARM=0
if ollama ps 2>/dev/null | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
    ALREADY_WARM=1
fi
if [[ $ALREADY_WARM -eq 1 ]]; then
    echo "→ $MODEL is already resident (keep-alive). Pinging to confirm..."
else
    cat <<'BANNER'
════════════════════════════════════════════════════════════════════
  VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
════════════════════════════════════════════════════════════════════

The next 60-90 s will hammer disk + memory bandwidth. Holosoma
cannot balance through this. If the robot is standing, IT WILL FALL.

Press ENTER when the robot is safely in squat / damping.
Press Ctrl-C to abort.

BANNER
    read -r
    echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
    echo "  num_batch=$NUM_BATCH num_ctx=$NUM_CTX (matching Marcus's runtime config)"
    echo "  Watching tegrastats in a second window is useful — GPU should spike."
    echo
fi
START=$(date +%s)

# Send a tiny text-only request. Ollama loads the model on the first request,
# and we don't need vision here — getting the weights resident is the slow
# part. A text-only warmup also avoids needing a camera frame.
#
# keep_alive=24h matches the systemd OLLAMA_KEEP_ALIVE setting and prevents
# the server from unloading the model after the default 5 min.
RESPONSE=$(curl -s "$HOST/api/generate" \
    -H 'Content-Type: application/json' \
    -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":$NUM_BATCH,\"num_ctx\":$NUM_CTX}}")

END=$(date +%s)
ELAPSED=$((END - START))

if echo "$RESPONSE" | grep -q '"response"'; then
    if [[ $ALREADY_WARM -eq 1 ]]; then
        echo "✓ Model already warm — no cold-load needed (${ELAPSED}s ping)"
    else
        echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
        echo "  You can now stand the robot and run: python3 run_marcus.py"
    fi
else
    echo "✗ Warmup failed after ${ELAPSED}s"
    echo "  Response: $RESPONSE"
    echo "  Check: journalctl -u ollama --since '2 minutes ago' --no-pager | tail -40"
    exit 1
fi
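# Optional residency check (a sketch added here, not part of the original
# procedure): after a successful warmup, `ollama ps` should list the model
# with an expiry roughly 24 h out. Output columns vary across Ollama
# versions, so this is informational only.
ollama ps 2>/dev/null | grep -F "$MODEL" || true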