#!/usr/bin/env bash
# warmup_vlm.sh — pre-load Qwen2.5-VL into iGPU BEFORE the robot stands up.
#
# WHY THIS EXISTS:
# Cold-loading a 3 GB vision model on the Jetson's 16 GB unified memory
# hammers disk + memory bandwidth hard enough to blow Holosoma's 20 ms
# real-time deadline. If that happens while the robot is standing, Holosoma
# loses balance control and the robot falls (observed 2026-04-22 —
# RL FPS crashed from 56 → 10 during cold-load, OOM killer reaped Holosoma).
#
# SAFE OPERATING PROCEDURE (do this once per boot):
# 1. Boot robot. Keep it in squat / damping mode (NOT standing).
# 2. Run: ./warmup_vlm.sh
# 3. Wait for "Warmup complete" (~60-90 s).
# 4. NOW raise the robot to standing.
# 5. Run: python3 run_marcus.py
#
# After warmup, OLLAMA_KEEP_ALIVE=24h keeps the model hot for the rest of the
# day, so every "what do you see" is fast (~2-5 s) and Holosoma stays
# healthy. Never cold-load a vision model while the robot is standing.
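#
# The 24h keep-alive only sticks if the Ollama server is configured for it.
# A sketch of the systemd drop-in this assumes (path and value are
# assumptions; check your install):
#
#   # /etc/systemd/system/ollama.service.d/override.conf
#   [Service]
#   Environment="OLLAMA_KEEP_ALIVE=24h"
#
# then: sudo systemctl daemon-reload && sudo systemctl restart ollama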
set -euo pipefail
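# Usage: ./warmup_vlm.sh [model-tag]
# Any tag present in 'ollama list' works, e.g. 'qwen2.5vl:7b' (illustrative
# tag; pull it first if you want the larger model).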
MODEL="${1:-qwen2.5vl:3b}"
HOST="http://127.0.0.1:11434"
cat <<'BANNER'
════════════════════════════════════════════════════════════════════
VLM WARMUP — put the robot in SQUAT / DAMPING mode first!
════════════════════════════════════════════════════════════════════
The next 60-90 s will hammer disk + memory bandwidth. Holosoma
cannot balance through this. If the robot is standing, IT WILL FALL.
Press ENTER when the robot is safely in squat / damping.
Press Ctrl-C to abort.
BANNER
read -r
# Sanity: is ollama reachable?
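# (-f makes curl fail on HTTP error codes, so a server that answers but
# returns an error also trips this check.)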
if ! curl -sf "$HOST/api/version" > /dev/null; then
echo "✗ Ollama is not running on $HOST"
echo " Start it: sudo systemctl start ollama"
exit 1
fi
# Sanity: is the model in the store?
if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
echo "✗ Model '$MODEL' not found in Ollama store"
echo " Pull it: ollama pull $MODEL"
exit 1
fi
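# Equivalent check via the HTTP API, if you'd rather not shell out to the
# CLI (a sketch; /api/tags lists the models in the local store):
#   curl -s "$HOST/api/tags" | grep -q "\"name\":\"$MODEL\""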
echo "→ Cold-loading $MODEL into iGPU (60-90 s, do not interrupt)..."
echo " Watching tegrastats in a second window is useful — GPU should spike."
echo
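# e.g. in a second terminal:
#   sudo tegrastats --interval 1000   # GR3D_FREQ (GPU load) should spike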
START=$(date +%s)
# Send a tiny text-only request. Ollama loads the model on first request;
# we don't need vision here — just getting weights resident is what takes the
# long time. A text-only warmup also avoids needing a camera frame.
#
# "keep_alive": "24h" matches the systemd OLLAMA_KEEP_ALIVE setting and
# prevents the server from unloading the model after the default 5 min.
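#
# Option choices (tuning assumptions, not values mandated by Ollama):
#   num_predict=1 -> emit a single token; load time is all we measure
#   num_ctx=1024  -> small context so the warmup KV cache stays tiny
#   num_batch=128 -> modest batch size, gentler on memory bandwidth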
RESPONSE=$(curl -s "$HOST/api/generate" \
  -H 'Content-Type: application/json' \
  -d "{\"model\":\"$MODEL\",\"prompt\":\"ok\",\"stream\":false,\"keep_alive\":\"24h\",\"options\":{\"num_predict\":1,\"num_batch\":128,\"num_ctx\":1024}}")
END=$(date +%s)
ELAPSED=$((END - START))
if echo "$RESPONSE" | grep -q '"response"'; then
echo "✓ Warmup complete in ${ELAPSED}s — model is resident for 24h"
echo " You can now stand the robot and run: python3 run_marcus.py"
else
echo "✗ Warmup failed after ${ELAPSED}s"
echo " Response: $RESPONSE"
echo " Check: journalctl -u ollama --since '2 minutes ago' --no-pager | tail -40"
exit 1
fi